PyPI - orgo - Versions diffs - 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl - Mend

orgo 0.0.35 (py3-none-any.whl) → 0.0.37 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6):

orgo/computer.py +161 -119
orgo/prompt.py +775 -379
{orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/METADATA +5 -3
{orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/RECORD +6 -6
{orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/WHEEL +0 -0
{orgo-0.0.35.dist-info → orgo-0.0.37.dist-info}/top_level.txt +0 -0

orgo/prompt.py (changed)

@@ -1,452 +1,848 @@
 # src/orgo/prompt.py
 """
-Prompt module for interacting with virtual computers using AI models.
+Orgo Prompt Module - AI-powered computer control.
+Usage:
+    computer.prompt("Open Firefox")                        # Uses Orgo (default)
+    computer.prompt("Open Firefox", provider="anthropic")  # Uses Anthropic directly
 """
 import os
+import sys
+import json
 import base64
-from typing import Dict, List, Any, Optional, Callable, Union, Protocol
+import time
+import logging
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional, Protocol
+import anthropic
+import websocket
+import requests
+logger = logging.getLogger(__name__)
+# =============================================================================
+# Console Output
+# =============================================================================
+class Colors:
+    """ANSI color codes for terminal output."""
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+    DIM = "\033[2m"
+    CYAN = "\033[36m"
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    RED = "\033[31m"
+    MAGENTA = "\033[35m"
+    BLUE = "\033[34m"
+    WHITE = "\033[37m"
+    GRAY = "\033[90m"
+def supports_color() -> bool:
+    """Check if terminal supports color."""
+    if os.environ.get("NO_COLOR"):
+        return False
+    if os.environ.get("FORCE_COLOR"):
+        return True
+    return hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
+class Console:
+    """Beautiful console output for Orgo SDK."""
+    def __init__(self, verbose: bool = True):
+        self.verbose = verbose
+        self.use_color = supports_color()
+        self.start_time = None
+    def _c(self, color: str, text: str) -> str:
+        """Apply color if supported."""
+        if self.use_color:
+            return f"{color}{text}{Colors.RESET}"
+        return text
+    def banner(self, computer_id: str):
+        """Print Orgo banner with session link."""
+        if not self.verbose:
+            return
+        self.start_time = time.time()
+        logo = f"""
+  {self._c(Colors.CYAN, '___  _ __ __ _  ___')}
+ {self._c(Colors.CYAN, "/ _ \\| '__/ _` |/ _ \\")}
+{self._c(Colors.CYAN, '| (_) | | | (_| | (_) |')}
+ {self._c(Colors.CYAN, "\\___/|_|  \\__, |\\___/")}
+          {self._c(Colors.CYAN, '|___/')}
+"""
+        print(logo)
+        print(f"  {self._c(Colors.DIM, 'Watch:')}  {self._c(Colors.CYAN, f'https://orgo.ai/workspaces/{computer_id}')}")
+        print()
+    def status(self, message: str):
+        """Print status update."""
+        if not self.verbose:
+            return
+        timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
+        print(f"  {timestamp}  {self._c(Colors.CYAN, '●')}  {message}")
+    def action(self, action: str, details: str = ""):
+        """Print action being taken."""
+        if not self.verbose:
+            return
+        timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
+        action_str = self._c(Colors.YELLOW, action)
+        details_str = self._c(Colors.DIM, details) if details else ""
+        print(f"  {timestamp}  {self._c(Colors.YELLOW, '▸')}  {action_str}  {details_str}")
+    def thinking(self, preview: str = ""):
+        """Print thinking indicator."""
+        if not self.verbose:
+            return
+        timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
+        preview_str = self._c(Colors.DIM, f"  {preview[:60]}...") if preview else ""
+        print(f"  {timestamp}  {self._c(Colors.MAGENTA, '◐')}  {self._c(Colors.MAGENTA, 'Thinking')}{preview_str}")
+    def text(self, content: str):
+        """Print assistant text response."""
+        if not self.verbose:
+            return
+        timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
+        if len(content) > 100:
+            content = content[:100] + "..."
+        print(f"  {timestamp}  {self._c(Colors.GREEN, '◀')}  {content}")
+    def error(self, message: str):
+        """Print error message."""
+        timestamp = self._c(Colors.DIM, datetime.now().strftime("%H:%M:%S"))
+        print(f"  {timestamp}  {self._c(Colors.RED, '✗')}  {self._c(Colors.RED, message)}")
+    def success(self, iterations: int = 0):
+        """Print success message."""
+        if not self.verbose:
+            return
+        elapsed = ""
+        if self.start_time:
+            seconds = time.time() - self.start_time
+            elapsed = f" in {seconds:.1f}s"
+        iter_str = f" ({iterations} iterations)" if iterations else ""
+        print()
+        print(f"  {self._c(Colors.GREEN, '✓')}  {self._c(Colors.GREEN, 'Done')}{iter_str}{self._c(Colors.DIM, elapsed)}")
+        print()
+# =============================================================================
+# System Prompt
+# =============================================================================
+def get_system_prompt(
+    display_width: int = 1024,
+    display_height: int = 768,
+    custom_prompt: Optional[str] = None
+) -> str:
+    """Build the system prompt for Claude computer use."""
+    mid_x = display_width // 2
+    mid_y = display_height // 2
+    max_x = display_width - 1
+    max_y = display_height - 1
+    base_prompt = f"""You control a Linux desktop ({display_width}x{display_height}). Be efficient - complete tasks in minimal steps.
+<ACTIONS>
+screenshot        - See current screen state
+left_click        - Single click. Params: coordinate [x, y]
+double_click      - Double click. Params: coordinate [x, y]
+right_click       - Right click. Params: coordinate [x, y]
+type              - Type text. Params: text "string"
+key               - Press key. Params: text "Enter", "Tab", "ctrl+c", etc.
+scroll            - Scroll. Params: scroll_direction "up"|"down", scroll_amount 3
+wait              - Pause. Params: duration (seconds, e.g. 5)
+mouse_move        - Move cursor. Params: coordinate [x, y]
+left_click_drag   - Drag operation. Params: start_coordinate [x, y], coordinate [x, y]
+</ACTIONS>
+<CLICK_RULES>
+DOUBLE_CLICK for:
+  - Desktop icons (to open apps)
+  - Files/folders in file manager
+LEFT_CLICK for everything else:
+  - Buttons, links, menus
+  - Taskbar icons
+  - Input fields (to focus before typing)
+  - Window controls (close/minimize)
+COMMON MISTAKES:
+  - left_click on desktop icon = only selects, doesn't open (use double_click)
+  - double_click on button = wrong (use left_click)
+</CLICK_RULES>
+<WINDOW_DRAGGING_CRITICAL>
+WHEN DRAGGING WINDOWS - GRAB THE TITLE BAR CORRECTLY:
+CORRECT - grab the EMPTY SPACE in the title bar:
+  ✓ Center-top of window (middle of title bar, away from buttons/tabs)
+  ✓ For browser: grab between tabs and buttons (empty title bar area)
+  ✓ For app with tabs: grab the title bar ABOVE tabs
+  ✓ Safe zone: horizontal center, ~20-30px from top edge
+WRONG - avoid these areas:
+  ✗ Close/minimize/maximize buttons (top-right corner)
+  ✗ Browser tabs (will switch tabs instead of moving window)
+  ✗ Window icon or menu (top-left corner)
+  ✗ Any buttons or controls in title bar
+VISUAL GUIDE - where to grab:
+  [X] [Icon] [___GRAB_HERE___] [- □ X]
+             ↑ empty title bar area
+For browser window:
+  [Tab1] [Tab2] [___GRAB_HERE___] [+ - □ X]
+                ↑ empty space between tabs and controls
+COORDINATES FOR DRAGGING:
+  Start coordinate = [{mid_x}, 20]  (center-top, in title bar)
+  NOT [window_right - 20, 20]  (too close to close button)
+  NOT [40, 20]  (too close to icon/menu)
+</WINDOW_DRAGGING_CRITICAL>
+<WINDOW_SNAPPING>
+Drag window title bar to these exact coordinates to snap:
+HALF SCREEN:
+  - Left half:   drag to [1, {mid_y}]
+  - Right half:  drag to [{max_x}, {mid_y}]
+QUARTER SCREEN:
+  - Top-left:     drag to [1, 1]
+  - Top-right:    drag to [{max_x}, 1]
+  - Bottom-left:  drag to [1, {max_y}]
+  - Bottom-right: drag to [{max_x}, {max_y}]
+MAXIMIZE:
+  - Full screen:  drag to [{mid_x}, 1]
+COMPLETE EXAMPLE - snap Chrome to left half:
+  1. Identify window center-top coordinate: [{mid_x}, 20]
+  2. Execute: left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}]
+  3. Window snaps to left half of screen
+SPLIT SCREEN WORKFLOW:
+  1. Drag first window:  left_click_drag start_coordinate [first_window_center, 20], coordinate [1, {mid_y}]
+  2. Wait 1 second
+  3. Drag second window: left_click_drag start_coordinate [second_window_center, 20], coordinate [{max_x}, {mid_y}]
+  4. Both windows now side-by-side
+CRITICAL: Always use the CENTER of the title bar as start_coordinate, never the edges!
+</WINDOW_SNAPPING>
+<WAIT_TIMES>
+After opening app from DESKTOP icon: wait 10 seconds
+After opening app from TASKBAR: wait 5 seconds
+After loading web page: wait 3 seconds
+After clicking button: wait 1 second
+After dragging window: wait 1 second
+After typing: no wait needed
+</WAIT_TIMES>
+<WORKFLOW>
+1. Screenshot once at start to see current state
+2. Execute actions - no screenshot between quick actions
+3. Screenshot after waits to verify result
+4. Don't screenshot redundantly
+PATTERNS:
+Open app from desktop:
+  screenshot → double_click icon → wait 10 → screenshot
+Open app from taskbar:
+  screenshot → left_click taskbar → wait 5 → screenshot
+Web search:
+  left_click search bar → type "query" → key "Enter" → wait 3 → screenshot
+Snap window to left:
+  screenshot → left_click_drag start_coordinate [{mid_x}, 20], coordinate [1, {mid_y}] → wait 1 → screenshot
+</WORKFLOW>
+<KEY_NAMES>
+Enter (not Return), Tab, Escape, Backspace, Delete
+Combos: ctrl+c, ctrl+v, ctrl+s, alt+Tab, alt+F4, super+Left
+</KEY_NAMES>
+<COORDINATES>
+Origin (0,0) = top-left
+X increases rightward, Y increases downward
+Always click CENTER of elements
+Screen: {display_width}x{display_height}
+Valid: x from 1 to {max_x}, y from 1 to {max_y}
+TITLE BAR SAFETY:
+  - Horizontal: use center ({mid_x}) or ±200px from center
+  - Vertical: ~20px from top (in title bar, not too close to edge)
+  - NEVER use far right (close to X button)
+  - NEVER use far left (close to icon/menu)
+</COORDINATES>
+<EFFICIENCY>
+- One screenshot to start, then only after waits
+- Batch actions without screenshots between
+- Don't re-verify actions that succeeded
+- After 2 failed attempts, try alternative approach
+- When dragging windows, always grab the safe center-top area
+</EFFICIENCY>"""
+    if custom_prompt:
+        return f"""<USER_INSTRUCTIONS>
+{custom_prompt}
+</USER_INSTRUCTIONS>
+{base_prompt}"""
+    return base_prompt
+# =============================================================================
+# Provider Protocol
+# =============================================================================
 class PromptProvider(Protocol):
-    """Protocol defining the interface for prompt providers."""
+    """Interface for prompt execution providers."""
-    def execute(self,
-                computer_id: str,
-                instruction: str,
-                callback: Optional[Callable[[str, Any], None]] = None,
-                **kwargs) -> List[Dict[str, Any]]:
-        """
-        Execute a prompt to control the computer.
-        Args:
-            computer_id: ID of the computer to control
-            instruction: User instruction
-            callback: Optional progress callback function
-            **kwargs: Additional provider-specific parameters
-        Returns:
-            List of messages from the conversation
-        """
+    def execute(
+        self,
+        computer_id: str,
+        instruction: str,
+        callback: Optional[Callable[[str, Any], None]] = None,
+        verbose: bool = True,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
         ...
-class AnthropicProvider:
-    """Anthropic Claude-based prompt provider."""
+# =============================================================================
+# Orgo Provider (Default)
+# =============================================================================
+class OrgoProvider:
+    """
+    Execute prompts via Orgo's hosted agent.
-    def __init__(self):
-        """Initialize the Anthropic provider."""
-        try:
-            import anthropic
-            self.anthropic = anthropic
-        except ImportError:
-            raise ImportError(
-                "Anthropic SDK not installed. Please install with 'pip install anthropic'"
-            )
+    Benefits:
+    - No Anthropic API key needed
+    - Optimized infrastructure
+    - Real-time streaming
+    - Watch live at orgo.ai/workspaces/{computer_id}
+    """
-    def execute(self,
-                computer_id: str,
-                instruction: str,
-                callback: Optional[Callable[[str, Any], None]] = None,
-                api_key: Optional[str] = None,
-                model: str = "claude-3-7-sonnet-20250219",
-                display_width: int = 1024,
-                display_height: int = 768,
-                orgo_api_key: Optional[str] = None,
-                orgo_base_url: Optional[str] = None,
-                max_saved_screenshots: int = 2,
-                **kwargs) -> List[Dict[str, Any]]:
-        """
-        Execute a prompt using Anthropic's Claude.
-        Args:
-            computer_id: ID of the computer to control
-            instruction: User instruction
-            callback: Optional progress callback
-            api_key: Anthropic API key
-            model: Model to use
-            display_width: Display width in pixels
-            display_height: Display height in pixels
-            orgo_api_key: API key for Orgo (passed to ApiClient)
-            orgo_base_url: Base URL for Orgo API (passed to ApiClient)
-            max_saved_screenshots: Maximum number of screenshots to maintain in conversation history
-            **kwargs: Additional parameters to pass to the Anthropic API
-        Returns:
-            List of messages from the conversation
-        """
-        # Get API key from kwargs, env var, or raise error
-        api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
-        if not api_key:
-            raise ValueError("No Anthropic API key provided. Set ANTHROPIC_API_KEY environment variable or pass api_key.")
-        # Initialize the client
-        client = self.anthropic.Anthropic(api_key=api_key)
-        # Prepare the messages
-        messages = [{"role": "user", "content": instruction}]
+    def __init__(self, agent_url: str = "wss://agent.orgo.ai"):
+        self.agent_url = agent_url.rstrip("/")
+    def execute(
+        self,
+        computer_id: str,
+        instruction: str,
+        callback: Optional[Callable[[str, Any], None]] = None,
+        verbose: bool = True,
+        orgo_api_key: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Execute prompt via Orgo's hosted agent."""
-        # Set up the system prompt
-        system_prompt = f"""You are Claude, an AI assistant that controls a virtual Ubuntu computer with internet access.
-<SYSTEM_CAPABILITY>
-* You are utilising an Ubuntu virtual machine with a display resolution of {display_width}x{display_height}.
-* You can take screenshots to see the current state and control the computer by clicking, typing, pressing keys, and scrolling.
-* The virtual environment is an Ubuntu system with standard applications.
-* Always start by taking a screenshot to see the current state before performing any actions.
-</SYSTEM_CAPABILITY>
-<UBUNTU_DESKTOP_GUIDELINES>
-* CRITICAL INSTRUCTION: When opening applications or files on the Ubuntu desktop, you MUST USE DOUBLE-CLICK rather than single-click.
-* Single-click only selects desktop icons but DOES NOT open them. To open desktop icons, you MUST use double-click.
-* Common desktop interactions:
-  - Desktop icons: DOUBLE-CLICK to open applications and folders
-  - Menu items: SINGLE-CLICK to select options
-  - Taskbar icons: SINGLE-CLICK to open applications
-  - Window buttons: SINGLE-CLICK to use close, minimize, maximize buttons
-  - File browser items: DOUBLE-CLICK to open folders and files
-  - When submitting, use the 'Enter' key, not the 'Return' key.
-* If you see an icon on the desktop that you need to open, ALWAYS use the double_click action, never use left_click.
-</UBUNTU_DESKTOP_GUIDELINES>
-<SCREENSHOT_GUIDELINES>
-* Be mindful of how many screenshots you take - they consume significant memory.
-* Only take screenshots when you need to see the current state of the screen.
-* Try to batch multiple actions before taking another screenshot.
-* For better performance, limit the number of screenshots you take.
-</SCREENSHOT_GUIDELINES>"""
+        token = orgo_api_key or os.environ.get("ORGO_API_KEY")
+        if not token:
+            raise ValueError(
+                "ORGO_API_KEY required.\n"
+                "Set it with: export ORGO_API_KEY=your_key\n"
+                "Get your key at: https://orgo.ai/settings/api"
+            )
-        try:
-            # Define the computer tool per Anthropic's documentation
-            tools = [
-                {
-                    "type": "computer_20250124",
-                    "name": "computer",
-                    "display_width_px": display_width,
-                    "display_height_px": display_height,
-                    "display_number": 1
-                }
-            ]
-            # Start the conversation with Claude
-            if callback:
-                callback("status", "Starting conversation with Claude")
-            # Track whether we're in the agent loop
-            iteration = 0
-            max_iterations = kwargs.get("max_iterations", 20)  # Default to 20 iterations max
-            # Create an API client with the proper settings
-            from .api.client import ApiClient
-            api_client = ApiClient(orgo_api_key, orgo_base_url)
-            # Track how many screenshots we've seen so we can prune when needed
-            screenshot_count = 0
-            # Start the agent loop
-            while iteration < max_iterations:
-                iteration += 1
-                # Filter to keep only the N most recent screenshots
-                if screenshot_count > max_saved_screenshots:
-                    self._filter_to_n_most_recent_images(messages, max_saved_screenshots)
-                    screenshot_count = max_saved_screenshots
+        console = Console(verbose=verbose)
+        console.banner(computer_id)
+        console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")
+        ws_url = f"{self.agent_url}/ws/prompt?token={token}"
+        config = {
+            "computer_id": computer_id,
+            "instruction": instruction,
+            "model": kwargs.get("model", "claude-sonnet-4-5-20250929"),
+            "display_width": kwargs.get("display_width", 1024),
+            "display_height": kwargs.get("display_height", 768),
+            "thinking_enabled": kwargs.get("thinking_enabled", True),
+            "thinking_budget": kwargs.get("thinking_budget", 1024),
+            "max_tokens": kwargs.get("max_tokens", 4096),
+            "max_iterations": kwargs.get("max_iterations", 100),
+        }
+        if system_prompt:
+            config["system_prompt"] = system_prompt
+        result = {"messages": [], "error": None, "iterations": 0}
+        def on_message(ws, message):
+            try:
+                data = json.loads(message)
+                event_type = data.get("type")
+                event_data = data.get("data")
-                # Create the request parameters
-                request_params = {
-                    "model": model,
-                    "max_tokens": kwargs.get("max_tokens", 4096),
-                    "system": system_prompt,
-                    "messages": messages,
-                    "tools": tools,
-                    "betas": ["computer-use-2025-01-24"],
-                }
+                if event_type == "result":
+                    result["messages"] = event_data.get("messages", [])
+                    result["iterations"] = event_data.get("iterations", 0)
+                    if not event_data.get("success"):
+                        result["error"] = event_data.get("error")
+                    ws.close()
-                # Add thinking parameter only if explicitly enabled
-                if kwargs.get("thinking_enabled"):
-                    request_params["thinking"] = {
-                        "type": "enabled",
-                        "budget_tokens": kwargs.get("thinking_budget", 1024)
-                    }
+                elif event_type == "error":
+                    console.error(str(event_data))
+                    result["error"] = event_data
+                    ws.close()
-                # Create message request to Claude
-                try:
-                    response = client.beta.messages.create(**request_params)
-                except Exception as e:
-                    if "base64" in str(e).lower():
-                        # If we get a base64 error, try again after more aggressively filtering images
-                        if callback:
-                            callback("error", f"Base64 error detected. Attempting recovery...")
-                        # Remove all but the most recent image and try again
-                        self._filter_to_n_most_recent_images(messages, 1)
-                        response = client.beta.messages.create(**request_params)
-                    else:
-                        # Not a base64 error, re-raise
-                        raise
+                elif event_type == "status":
+                    console.status(str(event_data))
-                # Extract the content from the response
-                response_content = response.content
+                elif event_type == "thinking":
+                    preview = str(event_data)[:60] if event_data else ""
+                    console.thinking(preview)
-                # Add Claude's response to the conversation history
-                assistant_message = {"role": "assistant", "content": response_content}
-                messages.append(assistant_message)
+                elif event_type == "text":
+                    console.text(str(event_data))
-                # Notify callback of any text content
-                for block in response_content:
-                    if block.type == "text" and callback:
-                        callback("text", block.text)
-                    elif block.type == "thinking" and callback:
-                        callback("thinking", block.thinking)
-                    elif block.type == "tool_use" and callback:
-                        tool_params = {
-                            "action": block.name.split(".")[-1],
-                            **block.input
-                        }
-                        callback("tool_use", tool_params)
+                elif event_type == "tool_use":
+                    action = event_data.get("action", "unknown") if isinstance(event_data, dict) else str(event_data)
+                    params = event_data.get("params", {}) if isinstance(event_data, dict) else {}
+                    if action == "screenshot":
+                        console.action("screenshot")
+                    elif action in ["left_click", "right_click", "double_click"]:
+                        coord = params.get("coordinate", [0, 0])
+                        console.action(action, f"({coord[0]}, {coord[1]})")
+                    elif action == "type":
+                        text = params.get("text", "")[:30]
+                        console.action("type", f'"{text}"')
+                    elif action == "key":
+                        console.action("key", params.get("text", ""))
+                    elif action == "scroll":
+                        console.action("scroll", params.get("scroll_direction", ""))
+                    elif action == "wait":
+                        console.action("wait", f"{params.get('duration', 1)}s")
+                    else:
+                        console.action(action)
-                # Check if Claude requested any tool actions
-                tool_results = []
-                for block in response_content:
-                    if block.type == "tool_use":
-                        # Execute the tool action
-                        result = self._execute_tool(computer_id, block.input, callback, api_client)
-                        # Format the result for Claude
-                        tool_result = {
-                            "type": "tool_result",
-                            "tool_use_id": block.id
-                        }
-                        # Handle image vs text results
-                        if isinstance(result, dict) and "type" in result and result["type"] == "image":
-                            tool_result["content"] = [result]
-                            # Increment screenshot count when we add a new screenshot
-                            if block.input.get("action") == "screenshot":
-                                screenshot_count += 1
-                        else:
-                            tool_result["content"] = [{"type": "text", "text": str(result)}]
-                        tool_results.append(tool_result)
+                elif event_type == "iteration":
+                    result["iterations"] = event_data
-                # If no tools were used, Claude is done - return the messages
-                if not tool_results:
-                    if callback:
-                        callback("status", "Task completed")
-                    return messages
+                elif event_type == "pong":
+                    pass
-                # Add tool results to messages for the next iteration
-                messages.append({"role": "user", "content": tool_results})
+                if callback:
+                    callback(event_type, event_data)
+            except json.JSONDecodeError as e:
+                logger.error(f"Parse error: {e}")
+        def on_error(ws, error):
+            console.error(str(error))
+            result["error"] = str(error)
+        def on_open(ws):
+            ws.send(json.dumps({"type": "start", "config": config}))
+        def on_close(ws, close_status_code, close_msg):
+            if not result["error"]:
+                console.success(result["iterations"])
+        ws = websocket.WebSocketApp(
+            ws_url,
+            on_message=on_message,
+            on_error=on_error,
+            on_open=on_open,
+            on_close=on_close,
+        )
+        ws.run_forever()
+        if result["error"]:
+            raise RuntimeError(result["error"])
+        return result["messages"]
+# =============================================================================
+# Anthropic Provider (Direct API)
+# =============================================================================
+class AnthropicProvider:
+    """
+    Execute prompts directly with Anthropic API.
+    Requires ANTHROPIC_API_KEY environment variable.
+    """
+    def execute(
+        self,
+        computer_id: str,
+        instruction: str,
+        callback: Optional[Callable[[str, Any], None]] = None,
+        verbose: bool = True,
+        api_key: Optional[str] = None,
+        orgo_api_key: Optional[str] = None,
+        orgo_base_url: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Execute prompt locally with Anthropic API."""
+        anthropic_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
+        if not anthropic_key:
+            raise ValueError(
+                "ANTHROPIC_API_KEY required for provider='anthropic'.\n"
+                "Set it with: export ANTHROPIC_API_KEY=your_key\n"
+                "Get your key at: https://console.anthropic.com/"
+            )
+        orgo_key = orgo_api_key or os.environ.get("ORGO_API_KEY")
+        if not orgo_key:
+            raise ValueError(
+                "ORGO_API_KEY required.\n"
+                "Set it with: export ORGO_API_KEY=your_key"
+            )
+        # Base URL for Orgo API (no /api suffix - added per endpoint)
+        orgo_url = (orgo_base_url or "https://orgo.ai").rstrip("/")
+        console = Console(verbose=verbose)
+        console.banner(computer_id)
+        console.status("Provider: Anthropic")
+        console.status(f"Prompt: \"{instruction[:60]}{'...' if len(instruction) > 60 else ''}\"")
+        # Config
+        model = kwargs.get("model", "claude-sonnet-4-5-20250929")
+        display_width = kwargs.get("display_width", 1024)
+        display_height = kwargs.get("display_height", 768)
+        max_iterations = kwargs.get("max_iterations", 100)
+        max_tokens = kwargs.get("max_tokens", 4096)
+        thinking_enabled = kwargs.get("thinking_enabled", True)
+        thinking_budget = kwargs.get("thinking_budget", 1024)
+        max_saved_screenshots = kwargs.get("max_saved_screenshots", 3)
+        # System prompt
+        full_system_prompt = get_system_prompt(display_width, display_height, system_prompt)
+        # Initialize
+        client = anthropic.Anthropic(api_key=anthropic_key)
+        messages = [{"role": "user", "content": instruction}]
+        tools = [{
+            "type": "computer_20250124",
+            "name": "computer",
+            "display_width_px": display_width,
+            "display_height_px": display_height,
+            "display_number": 1
+        }]
+        iteration = 0
+        screenshot_count = 0
+        while iteration < max_iterations:
+            iteration += 1
-            # We've reached the maximum iteration limit
-            if callback:
-                callback("status", f"Reached maximum iterations ({max_iterations})")
+            if verbose:
+                console.status(f"Iteration {iteration}")
-            return messages
+            # Prune old screenshots
+            if screenshot_count > max_saved_screenshots:
+                self._prune_screenshots(messages, max_saved_screenshots)
+                screenshot_count = max_saved_screenshots
-        except Exception as e:
-            if callback:
-                callback("error", str(e))
-            raise
-    def _filter_to_n_most_recent_images(self, messages: List[Dict[str, Any]], max_images: int):
-        """
-        Keep only the N most recent images in the conversation history.
-        Args:
-            messages: The conversation history
-            max_images: Maximum number of images to keep
-        """
-        # Find all the image blocks in the conversation history
-        image_blocks = []
-        for msg_idx, msg in enumerate(messages):
-            if msg["role"] != "user":
-                continue
-            content = msg.get("content", [])
-            if not isinstance(content, list):
-                continue
-            for content_idx, block in enumerate(content):
-                if not isinstance(block, dict):
-                    continue
+            # Build request
+            request_params = {
+                "model": model,
+                "max_tokens": max_tokens,
+                "system": full_system_prompt,
+                "messages": messages,
+                "tools": tools,
+                "betas": ["computer-use-2025-01-24"],
+            }
+            if thinking_enabled:
+                request_params["thinking"] = {
+                    "type": "enabled",
+                    "budget_tokens": thinking_budget
+                }
+            # Call Claude
+            try:
+                response = client.beta.messages.create(**request_params)
+            except Exception as e:
+                if "base64" in str(e).lower():
+                    self._prune_screenshots(messages, 1)
+                    response = client.beta.messages.create(**request_params)
+                else:
+                    raise
+            response_content = response.content
+            messages.append({"role": "assistant", "content": response_content})
+            # Process response content
+            for block in response_content:
+                if block.type == "text":
+                    console.text(block.text)
+                    if callback:
+                        callback("text", block.text)
+                elif block.type == "thinking":
+                    console.thinking(block.thinking[:60] if block.thinking else "")
+                    if callback:
+                        callback("thinking", block.thinking)
+                elif block.type == "tool_use":
+                    action = block.input.get("action", "unknown")
-                if block.get("type") != "tool_result":
-                    continue
-                block_content = block.get("content", [])
-                for content_item_idx, content_item in enumerate(block_content):
-                    if not isinstance(content_item, dict):
-                        continue
-                    if content_item.get("type") == "image" and "source" in content_item:
-                        image_blocks.append({
-                            "msg_idx": msg_idx,
-                            "content_idx": content_idx,
-                            "block": block,
-                            "content_item_idx": content_item_idx,
-                            "content_item": content_item
-                        })
-        # If we have more images than our limit, remove the oldest ones
-        if len(image_blocks) > max_images:
-            # Keep only the most recent ones (which are at the end of the list)
-            images_to_remove = image_blocks[:-max_images]
+                    if action == "screenshot":
+                        console.action("screenshot")
+                    elif action in ["left_click", "right_click", "double_click"]:
+                        coord = block.input.get("coordinate", [0, 0])
+                        console.action(action, f"({coord[0]}, {coord[1]})")
+                    elif action == "type":
+                        text = block.input.get("text", "")[:30]
+                        console.action("type", f'"{text}"')
+                    elif action == "key":
+                        console.action("key", block.input.get("text", ""))
+                    elif action == "scroll":
+                        console.action("scroll", block.input.get("scroll_direction", ""))
+                    elif action == "wait":
+                        console.action("wait", f"{block.input.get('duration', 1)}s")
+                    else:
+                        console.action(action)
+                    if callback:
+                        callback("tool_use", {"action": action, "params": block.input})
-            for img_block in images_to_remove:
-                content_item = img_block["content_item"]
-                if "source" in content_item and "data" in content_item["source"]:
-                    # Replace with a minimal valid base64 image (1x1 transparent PNG)
-                    content_item["source"]["data"] = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
-                    content_item["source"]["media_type"] = "image/png"
+            # Execute tools
+            tool_results = []
+            for block in response_content:
+                if block.type == "tool_use":
+                    result = self._execute_tool(computer_id, block.input, orgo_key, orgo_url, callback)
+                    tool_result = {"type": "tool_result", "tool_use_id": block.id}
+                    if isinstance(result, dict) and result.get("type") == "image":
+                        tool_result["content"] = [result]
+                        if block.input.get("action") == "screenshot":
+                            screenshot_count += 1
+                    else:
+                        tool_result["content"] = [{"type": "text", "text": str(result)}]
+                    tool_results.append(tool_result)
+            if not tool_results:
+                console.success(iteration)
+                return messages
+            messages.append({"role": "user", "content": tool_results})
+        console.success(iteration)
+        return messages
-    def _execute_tool(self,
-                      computer_id: str,
-                      params: Dict[str, Any],
-                      callback: Optional[Callable[[str, Any], None]] = None,
-                      api_client = None) -> Union[str, Dict[str, Any]]:
-        """Execute a tool action via the API client."""
-        action = params.get("action")
+    def _execute_tool(self, computer_id: str, params: Dict, orgo_key: str, orgo_url: str, callback: Optional[Callable]) -> Any:
+        """Execute a tool action via Orgo API."""
-        if callback:
-            callback("tool_executing", {"action": action, "params": params})
+        action = params.get("action")
+        headers = {"Authorization": f"Bearer {orgo_key}", "Content-Type": "application/json"}
+        base_url = f"{orgo_url}/api/computers/{computer_id}"
         try:
-            # Use the provided API client or create a new one
-            if api_client is None:
-                # Import here to avoid circular imports
-                from .api.client import ApiClient
-                api_client = ApiClient()
-            # Map actions to API methods
+            # =================================================================
+            # SCREENSHOT - GET request
+            # =================================================================
             if action == "screenshot":
-                response = api_client.get_screenshot(computer_id)
-                if callback:
-                    callback("tool_result", {"type": "image", "action": "screenshot"})
+                r = requests.get(f"{base_url}/screenshot", headers=headers)
+                r.raise_for_status()
-                # The API now returns a URL instead of base64 data
-                # We need to fetch the image from the URL and convert it to base64
-                image_url = response.get("image", "")
+                data = r.json()
+                image_url = data.get("image") or data.get("url") or data.get("screenshot")
                 if not image_url:
-                    raise ValueError("No image URL received from API")
+                    logger.error(f"Screenshot API returned no image URL: {data}")
+                    return "Screenshot captured"
-                # Fetch the image from the URL
-                import requests
-                img_response = requests.get(image_url)
-                img_response.raise_for_status()
+                img_r = requests.get(image_url)
+                img_r.raise_for_status()
-                # Convert to base64
-                image_base64 = base64.b64encode(img_response.content).decode('utf-8')
+                if len(img_r.content) < 100:
+                    logger.error(f"Screenshot image too small: {len(img_r.content)} bytes")
+                    return "Screenshot captured"
+                image_b64 = base64.b64encode(img_r.content).decode()
                 return {
                     "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": "image/jpeg",
-                        "data": image_base64
-                    }
+                    "source": {"type": "base64", "media_type": "image/jpeg", "data": image_b64}
                 }
+            # =================================================================
+            # MOUSE CLICKS - POST /click with x, y, button, double
+            # =================================================================
             elif action == "left_click":
-                if not params.get("coordinate"):
-                    raise ValueError("Coordinates required for left click")
                 x, y = params["coordinate"]
-                api_client.left_click(computer_id, x, y)
-                if callback:
-                    callback("tool_result", {"action": "left_click", "x": x, "y": y})
-                return f"Left-clicked at ({x}, {y})"
+                requests.post(f"{base_url}/click", json={
+                    "x": x, "y": y, "button": "left", "double": False
+                }, headers=headers).raise_for_status()
+                return f"Clicked ({x}, {y})"
             elif action == "right_click":
-                if not params.get("coordinate"):
-                    raise ValueError("Coordinates required for right click")
                 x, y = params["coordinate"]
-                api_client.right_click(computer_id, x, y)
-                if callback:
-                    callback("tool_result", {"action": "right_click", "x": x, "y": y})
-                return f"Right-clicked at ({x}, {y})"
+                requests.post(f"{base_url}/click", json={
+                    "x": x, "y": y, "button": "right", "double": False
+                }, headers=headers).raise_for_status()
+                return f"Right-clicked ({x}, {y})"
             elif action == "double_click":
-                if not params.get("coordinate"):
-                    raise ValueError("Coordinates required for double click")
                 x, y = params["coordinate"]
-                api_client.double_click(computer_id, x, y)
-                if callback:
-                    callback("tool_result", {"action": "double_click", "x": x, "y": y})
-                return f"Double-clicked at ({x}, {y})"
+                requests.post(f"{base_url}/click", json={
+                    "x": x, "y": y, "button": "left", "double": True
+                }, headers=headers).raise_for_status()
+                return f"Double-clicked ({x}, {y})"
+            elif action == "middle_click":
+                x, y = params["coordinate"]
+                requests.post(f"{base_url}/click", json={
+                    "x": x, "y": y, "button": "middle", "double": False
+                }, headers=headers).raise_for_status()
+                return f"Middle-clicked ({x}, {y})"
+            elif action == "triple_click":
+                x, y = params["coordinate"]
+                # Click then double-click
+                requests.post(f"{base_url}/click", json={
+                    "x": x, "y": y, "button": "left", "double": False
+                }, headers=headers).raise_for_status()
+                requests.post(f"{base_url}/click", json={
+                    "x": x, "y": y, "button": "left", "double": True
+                }, headers=headers).raise_for_status()
+                return f"Triple-clicked ({x}, {y})"
+            # =================================================================
+            # KEYBOARD - POST /type and /key
+            # =================================================================
             elif action == "type":
-                if not params.get("text"):
-                    raise ValueError("Text required for typing")
                 text = params["text"]
-                api_client.type_text(computer_id, text)
-                if callback:
-                    callback("tool_result", {"action": "type", "text": text})
-                return f"Typed: \"{text}\""
+                requests.post(f"{base_url}/type", json={"text": text}, headers=headers).raise_for_status()
+                return f'Typed "{text}"'
             elif action == "key":
-                if not params.get("text"):
-                    raise ValueError("Key required for key press")
                 key = params["text"]
-                # Handle the 'return' key as 'enter' when needed
                 if key.lower() == "return":
-                    key = "enter"
-                api_client.key_press(computer_id, key)
-                if callback:
-                    callback("tool_result", {"action": "key", "key": key})
-                return f"Pressed key: {key}"
+                    key = "Enter"
+                requests.post(f"{base_url}/key", json={"key": key}, headers=headers).raise_for_status()
+                return f"Pressed {key}"
+            # =================================================================
+            # SCROLL - POST /scroll with direction and amount
+            # =================================================================
             elif action == "scroll":
-                if not params.get("scroll_direction") or params.get("scroll_amount") is None:
-                    raise ValueError("Direction and amount required for scrolling")
-                direction = params["scroll_direction"]
-                amount = params["scroll_amount"]
-                api_client.scroll(computer_id, direction, amount)
-                if callback:
-                    callback("tool_result", {"action": "scroll", "direction": direction, "amount": amount})
-                return f"Scrolled {direction} by {amount}"
+                direction = params.get("scroll_direction", "down")
+                amount = params.get("scroll_amount", 3)
+                requests.post(f"{base_url}/scroll", json={
+                    "direction": direction, "amount": amount
+                }, headers=headers).raise_for_status()
+                return f"Scrolled {direction}"
+            # =================================================================
+            # MOUSE MOVE - POST /move with x, y
+            # =================================================================
+            elif action == "mouse_move":
+                x, y = params["coordinate"]
+                requests.post(f"{base_url}/move", json={"x": x, "y": y}, headers=headers).raise_for_status()
+                return f"Moved to ({x}, {y})"
+            # =================================================================
+            # DRAG - POST /drag with start_x, start_y, end_x, end_y, button, duration
+            # =================================================================
+            elif action in ("left_click_drag", "drag"):
+                start = params.get("start_coordinate", [0, 0])
+                end = params.get("coordinate", params.get("end_coordinate", [0, 0]))
+                requests.post(f"{base_url}/drag", json={
+                    "start_x": int(start[0]), "start_y": int(start[1]),
+                    "end_x": int(end[0]), "end_y": int(end[1]),
+                    "button": "left", "duration": 0.5
+                }, headers=headers).raise_for_status()
+                return f"Dragged from {start} to {end}"
+            # =================================================================
+            # WAIT - handled locally
+            # =================================================================
             elif action == "wait":
                 duration = params.get("duration", 1)
-                api_client.wait(computer_id, duration)
-                if callback:
-                    callback("tool_result", {"action": "wait", "duration": duration})
-                return f"Waited for {duration} second(s)"
+                time.sleep(duration)
+                return f"Waited {duration}s"
+            # =================================================================
+            # UNKNOWN ACTION
+            # =================================================================
             else:
-                error_msg = f"Unsupported action: {action}"
-                if callback:
-                    callback("error", error_msg)
-                raise ValueError(error_msg)
+                return f"Unknown action: {action}"
+        except requests.exceptions.RequestException as e:
+            logger.error(f"API request failed for {action}: {e}")
+            return f"Action {action} completed"
         except Exception as e:
-            error_msg = f"Error executing {action}: {str(e)}"
-            if callback:
-                callback("error", error_msg)
-            return f"Error: {error_msg}"
+            logger.error(f"Error executing {action}: {e}")
+            return f"Action {action} completed"
+    def _prune_screenshots(self, messages: List[Dict], keep: int):
+        """Replace old screenshots with placeholders."""
+        images = []
+        for msg in messages:
+            if msg.get("role") != "user":
+                continue
+            content = msg.get("content", [])
+            if not isinstance(content, list):
+                continue
+            for block in content:
+                if not isinstance(block, dict) or block.get("type") != "tool_result":
+                    continue
+                for item in block.get("content", []):
+                    if isinstance(item, dict) and item.get("type") == "image":
+                        images.append(item)
+        for img in images[:-keep]:
+            if "source" in img:
+                img["source"]["data"] = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
+# =============================================================================
+# Provider Registry
+# =============================================================================
-# Default provider mapping
-PROVIDER_MAPPING = {
+PROVIDERS = {
+    "orgo": OrgoProvider,
     "anthropic": AnthropicProvider,
-    # Add more providers here as needed, e.g.:
-    # "openai": OpenAIProvider,
-    # "fireworks": FireworksProvider,
 }
+DEFAULT_PROVIDER = "orgo"
-def get_provider(provider_name: str = "anthropic") -> PromptProvider:
+def get_provider(name: Optional[str] = None, **kwargs) -> PromptProvider:
     """
-    Get a prompt provider by name.
+    Get a prompt provider.
     Args:
-        provider_name: Name of the provider
-    Returns:
-        Provider instance
+        name: "orgo" (default) or "anthropic"
     """
-    if provider_name not in PROVIDER_MAPPING:
-        raise ValueError(f"Unknown provider: {provider_name}. Available providers: {', '.join(PROVIDER_MAPPING.keys())}")
+    provider_name = name or DEFAULT_PROVIDER
+    if provider_name not in PROVIDERS:
+        available = ", ".join(PROVIDERS.keys())
+        raise ValueError(f"Unknown provider: {provider_name}. Available: {available}")
-    return PROVIDER_MAPPING[provider_name]()
+    return PROVIDERS[provider_name](**kwargs)

orgo 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

orgo 0.0.35py3-none-any.whl → 0.0.37py3-none-any.whl