PyPI - orgo - Versions diffs - 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl - Mend

orgo-0.0.9-py3-none-any.whl → orgo-0.0.11-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7)

orgo/api/client.py +4 -4
orgo/computer.py +96 -4
orgo/prompt.py +405 -0
{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/METADATA +2 -2
{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/RECORD +7 -6
{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/WHEEL +1 -1
{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/top_level.txt +0 -0

orgo/api/client.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """API client for Orgo service"""
+# src/orgo/api/client.py
 import requests
 from typing import Dict, Any, Optional
@@ -6,10 +7,9 @@ from typing import Dict, Any, Optional
 from orgo.utils.auth import get_api_key
 class ApiClient:
-    BASE_URL = "https://www.orgo.ai/api"
-    def __init__(self, api_key: Optional[str] = None):
+    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None):
         self.api_key = get_api_key(api_key)
+        self.base_url = base_url or "https://www.orgo.ai/api"
         self.session = requests.Session()
         self.session.headers.update({
             "Authorization": f"Bearer {self.api_key}",
@@ -18,7 +18,7 @@ class ApiClient:
         })
     def _request(self, method: str, endpoint: str, data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-        url = f"{self.BASE_URL}/{endpoint}"
+        url = f"{self.base_url}/{endpoint}"
         try:
             if method.upper() == "GET":

orgo/computer.py CHANGED Viewed

@@ -1,16 +1,30 @@
 """Computer class for interacting with Orgo virtual environments"""
+# src/orgo/computer.py
 import os
 import io
 import base64
-from typing import Dict, Any
+from typing import Dict, List, Any, Optional, Callable, Union
 from PIL import Image
 from .api.client import ApiClient
+from .prompt import get_provider
 class Computer:
-    def __init__(self, project_id=None, api_key=None, config=None):
-        self.api = ApiClient(api_key or os.environ.get("ORGO_API_KEY"))
+    def __init__(self, project_id=None, api_key=None, config=None, base_api_url=None):
+        """
+        Initialize an Orgo virtual computer.
+        Args:
+            project_id: Existing project ID to connect to (optional)
+            api_key: Orgo API key (defaults to ORGO_API_KEY env var)
+            config: Configuration for new computer (optional)
+            base_api_url: Custom API URL (optional)
+        """
+        self.api_key = api_key or os.environ.get("ORGO_API_KEY")
+        self.base_api_url = base_api_url
+        self.api = ApiClient(self.api_key, self.base_api_url)
         if project_id:
             self.project_id = project_id
@@ -81,4 +95,82 @@ class Computer:
     def wait(self, seconds: float) -> Dict[str, Any]:
         """Wait for specified number of seconds"""
-        return self.api.wait(self.project_id, seconds)
+        return self.api.wait(self.project_id, seconds)
+    # AI control method
+    def prompt(self,
+               instruction: str,
+               provider: str = "anthropic",
+               model: str = "claude-3-7-sonnet-20250219",
+               display_width: int = 1024,
+               display_height: int = 768,
+               callback: Optional[Callable[[str, Any], None]] = None,
+               thinking_enabled: bool = False,
+               thinking_budget: int = 1024,
+               max_tokens: int = 4096,
+               max_iterations: int = 20,
+               max_saved_screenshots: int = 5,
+               api_key: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        Control the computer with natural language instructions using an AI assistant.
+        Args:
+            instruction: What you want the AI to do with the computer
+            provider: AI provider to use (default: "anthropic")
+            model: Model to use (default: "claude-3-7-sonnet-20250219")
+            display_width: Screen width in pixels
+            display_height: Screen height in pixels
+            callback: Optional callback function for progress updates
+            thinking_enabled: Enable Claude's thinking capability (default: False)
+            thinking_budget: Token budget for thinking (default: 1024)
+            max_tokens: Maximum tokens for model response
+            max_iterations: Maximum number of agent loop iterations
+            max_saved_screenshots: Maximum number of screenshots to keep in history (default: 5)
+            api_key: API key for the AI provider (defaults to env var)
+        Returns:
+            List of messages from the conversation
+        Examples:
+            # Simple usage with environment variables
+            computer.prompt("Open Firefox and search for Python tutorials")
+            # With explicit API key
+            computer.prompt("Open Terminal and list files", api_key="your-anthropic-key")
+            # With callback for progress updates
+            computer.prompt("Create a new text file", callback=my_callback_function)
+            # With thinking enabled (Claude 3.7 Sonnet)
+            computer.prompt(
+                "Analyze a complex webpage",
+                thinking_enabled=True
+            )
+            # With custom screenshot management
+            computer.prompt(
+                "Perform a complex multi-step task",
+                max_saved_screenshots=10  # Keep more screenshots for complex tasks
+            )
+        """
+        # Get the provider instance
+        provider_instance = get_provider(provider)
+        # Execute the prompt
+        return provider_instance.execute(
+            computer_id=self.project_id,
+            instruction=instruction,
+            callback=callback,
+            api_key=api_key,
+            model=model,
+            display_width=display_width,
+            display_height=display_height,
+            thinking_enabled=thinking_enabled,
+            thinking_budget=thinking_budget,
+            max_tokens=max_tokens,
+            max_iterations=max_iterations,
+            max_saved_screenshots=max_saved_screenshots,
+            # Pass through the Orgo API client configuration
+            orgo_api_key=self.api_key,
+            orgo_base_url=self.base_api_url
+        )

orgo/prompt.py ADDED Viewed

@@ -0,0 +1,405 @@
+"""
+Prompt module for interacting with virtual computers using AI models.
+"""
+import os
+from typing import Dict, List, Any, Optional, Callable, Union, Protocol
+class PromptProvider(Protocol):
+    """Protocol defining the interface for prompt providers."""
+    def execute(self,
+                computer_id: str,
+                instruction: str,
+                callback: Optional[Callable[[str, Any], None]] = None,
+                **kwargs) -> List[Dict[str, Any]]:
+        """
+        Execute a prompt to control the computer.
+        Args:
+            computer_id: ID of the computer to control
+            instruction: User instruction
+            callback: Optional progress callback function
+            **kwargs: Additional provider-specific parameters
+        Returns:
+            List of messages from the conversation
+        """
+        ...
+class AnthropicProvider:
+    """Anthropic Claude-based prompt provider."""
+    def __init__(self):
+        """Initialize the Anthropic provider."""
+        try:
+            import anthropic
+            self.anthropic = anthropic
+        except ImportError:
+            raise ImportError(
+                "Anthropic SDK not installed. Please install with 'pip install anthropic'"
+            )
+    def execute(self,
+                computer_id: str,
+                instruction: str,
+                callback: Optional[Callable[[str, Any], None]] = None,
+                api_key: Optional[str] = None,
+                model: str = "claude-3-7-sonnet-20250219",
+                display_width: int = 1024,
+                display_height: int = 768,
+                orgo_api_key: Optional[str] = None,
+                orgo_base_url: Optional[str] = None,
+                max_saved_screenshots: int = 2,
+                **kwargs) -> List[Dict[str, Any]]:
+        """
+        Execute a prompt using Anthropic's Claude.
+        Args:
+            computer_id: ID of the computer to control
+            instruction: User instruction
+            callback: Optional progress callback
+            api_key: Anthropic API key
+            model: Model to use
+            display_width: Display width in pixels
+            display_height: Display height in pixels
+            orgo_api_key: API key for Orgo (passed to ApiClient)
+            orgo_base_url: Base URL for Orgo API (passed to ApiClient)
+            max_saved_screenshots: Maximum number of screenshots to maintain in conversation history
+            **kwargs: Additional parameters to pass to the Anthropic API
+        Returns:
+            List of messages from the conversation
+        """
+        # Get API key from kwargs, env var, or raise error
+        api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise ValueError("No Anthropic API key provided. Set ANTHROPIC_API_KEY environment variable or pass api_key.")
+        # Initialize the client
+        client = self.anthropic.Anthropic(api_key=api_key)
+        # Prepare the messages
+        messages = [{"role": "user", "content": instruction}]
+        # Set up the system prompt
+        system_prompt = f"""You are Claude, an AI assistant that controls a virtual Ubuntu computer with internet access.
+<SYSTEM_CAPABILITY>
+* You are utilising an Ubuntu virtual machine with a display resolution of {display_width}x{display_height}.
+* You can take screenshots to see the current state and control the computer by clicking, typing, pressing keys, and scrolling.
+* The virtual environment is an Ubuntu system with standard applications.
+* Always start by taking a screenshot to see the current state before performing any actions.
+</SYSTEM_CAPABILITY>
+<UBUNTU_DESKTOP_GUIDELINES>
+* CRITICAL INSTRUCTION: When opening applications or files on the Ubuntu desktop, you MUST USE DOUBLE-CLICK rather than single-click.
+* Single-click only selects desktop icons but DOES NOT open them. To open desktop icons, you MUST use double-click.
+* Common desktop interactions:
+  - Desktop icons: DOUBLE-CLICK to open applications and folders
+  - Menu items: SINGLE-CLICK to select options
+  - Taskbar icons: SINGLE-CLICK to open applications
+  - Window buttons: SINGLE-CLICK to use close, minimize, maximize buttons
+  - File browser items: DOUBLE-CLICK to open folders and files
+  - When submitting, use the 'Enter' key, not the 'Return' key.
+* If you see an icon on the desktop that you need to open, ALWAYS use the double_click action, never use left_click.
+</UBUNTU_DESKTOP_GUIDELINES>"""
+        try:
+            # Define the computer tool per Anthropic's documentation
+            tools = [
+                {
+                    "type": "computer_20250124",
+                    "name": "computer",
+                    "display_width_px": display_width,
+                    "display_height_px": display_height,
+                    "display_number": 1
+                }
+            ]
+            # Start the conversation with Claude
+            if callback:
+                callback("status", "Starting conversation with Claude")
+            # Track whether we're in the agent loop
+            iteration = 0
+            max_iterations = kwargs.get("max_iterations", 20)  # Default to 20 iterations max
+            # Create an API client with the proper settings
+            from .api.client import ApiClient
+            api_client = ApiClient(orgo_api_key, orgo_base_url)
+            # Track how many screenshots we've seen so we can prune when needed
+            screenshot_count = 0
+            # Start the agent loop
+            while iteration < max_iterations:
+                iteration += 1
+                # Prune old screenshots if we've exceeded our limit
+                if screenshot_count > max_saved_screenshots:
+                    self._prune_old_screenshots(messages, screenshot_count - max_saved_screenshots)
+                    screenshot_count = max_saved_screenshots
+                # Create the request parameters
+                request_params = {
+                    "model": model,
+                    "max_tokens": kwargs.get("max_tokens", 4096),
+                    "system": system_prompt,
+                    "messages": messages,
+                    "tools": tools,
+                    "betas": ["computer-use-2025-01-24"],
+                }
+                # Add thinking parameter only if explicitly enabled
+                if kwargs.get("thinking_enabled"):
+                    request_params["thinking"] = {
+                        "type": "enabled",
+                        "budget_tokens": kwargs.get("thinking_budget", 1024)
+                    }
+                # Create message request to Claude
+                response = client.beta.messages.create(**request_params)
+                # Extract the content from the response
+                response_content = response.content
+                # Add Claude's response to the conversation history
+                assistant_message = {"role": "assistant", "content": response_content}
+                messages.append(assistant_message)
+                # Notify callback of any text content
+                for block in response_content:
+                    if block.type == "text" and callback:
+                        callback("text", block.text)
+                    elif block.type == "thinking" and callback:
+                        callback("thinking", block.thinking)
+                    elif block.type == "tool_use" and callback:
+                        tool_params = {
+                            "action": block.name.split(".")[-1],
+                            **block.input
+                        }
+                        callback("tool_use", tool_params)
+                # Check if Claude requested any tool actions
+                tool_results = []
+                for block in response_content:
+                    if block.type == "tool_use":
+                        # Execute the tool action
+                        result = self._execute_tool(computer_id, block.input, callback, api_client)
+                        # Format the result for Claude
+                        tool_result = {
+                            "type": "tool_result",
+                            "tool_use_id": block.id
+                        }
+                        # Handle image vs text results
+                        if isinstance(result, dict) and "type" in result and result["type"] == "image":
+                            tool_result["content"] = [result]
+                            # Increment screenshot count when we add a new screenshot
+                            if block.input.get("action") == "screenshot":
+                                screenshot_count += 1
+                        else:
+                            tool_result["content"] = [{"type": "text", "text": str(result)}]
+                        tool_results.append(tool_result)
+                # If no tools were used, Claude is done - return the messages
+                if not tool_results:
+                    if callback:
+                        callback("status", "Task completed")
+                    return messages
+                # Add tool results to messages for the next iteration
+                messages.append({"role": "user", "content": tool_results})
+            # We've reached the maximum iteration limit
+            if callback:
+                callback("status", f"Reached maximum iterations ({max_iterations})")
+            return messages
+        except Exception as e:
+            if callback:
+                callback("error", str(e))
+            raise
+    def _prune_old_screenshots(self, messages: List[Dict[str, Any]], num_to_prune: int):
+        """
+        Remove old screenshots from the conversation history.
+        Args:
+            messages: The conversation history
+            num_to_prune: Number of screenshots to remove
+        """
+        screenshots_pruned = 0
+        # Start from the beginning of the messages (excluding the first user message)
+        for i in range(1, len(messages)):
+            if messages[i]["role"] != "user":
+                continue
+            content = messages[i]["content"]
+            if not isinstance(content, list):
+                continue
+            # Look for tool_result blocks in the content
+            for j, block in enumerate(content):
+                if not isinstance(block, dict):
+                    continue
+                if block.get("type") != "tool_result":
+                    continue
+                # Check if this tool_result contains an image
+                block_content = block.get("content", [])
+                for k, content_item in enumerate(block_content):
+                    if not isinstance(content_item, dict):
+                        continue
+                    if content_item.get("type") == "image":
+                        # This is a screenshot, remove it
+                        if "source" in content_item and "data" in content_item["source"]:
+                            # Replace the base64 data with a placeholder to save space
+                            content_item["source"]["data"] = "[IMAGE DATA REMOVED]"
+                            screenshots_pruned += 1
+                            if screenshots_pruned >= num_to_prune:
+                                return
+    def _execute_tool(self,
+                      computer_id: str,
+                      params: Dict[str, Any],
+                      callback: Optional[Callable[[str, Any], None]] = None,
+                      api_client = None) -> Union[str, Dict[str, Any]]:
+        """Execute a tool action via the API client."""
+        action = params.get("action")
+        if callback:
+            callback("tool_executing", {"action": action, "params": params})
+        try:
+            # Use the provided API client or create a new one
+            if api_client is None:
+                # Import here to avoid circular imports
+                from .api.client import ApiClient
+                api_client = ApiClient()
+            # Map actions to API methods
+            if action == "screenshot":
+                response = api_client.get_screenshot(computer_id)
+                if callback:
+                    callback("tool_result", {"type": "image", "action": "screenshot"})
+                return {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/jpeg",
+                        "data": response.get("image", "")
+                    }
+                }
+            elif action == "left_click":
+                if not params.get("coordinate"):
+                    raise ValueError("Coordinates required for left click")
+                x, y = params["coordinate"]
+                api_client.left_click(computer_id, x, y)
+                if callback:
+                    callback("tool_result", {"action": "left_click", "x": x, "y": y})
+                return f"Left-clicked at ({x}, {y})"
+            elif action == "right_click":
+                if not params.get("coordinate"):
+                    raise ValueError("Coordinates required for right click")
+                x, y = params["coordinate"]
+                api_client.right_click(computer_id, x, y)
+                if callback:
+                    callback("tool_result", {"action": "right_click", "x": x, "y": y})
+                return f"Right-clicked at ({x}, {y})"
+            elif action == "double_click":
+                if not params.get("coordinate"):
+                    raise ValueError("Coordinates required for double click")
+                x, y = params["coordinate"]
+                api_client.double_click(computer_id, x, y)
+                if callback:
+                    callback("tool_result", {"action": "double_click", "x": x, "y": y})
+                return f"Double-clicked at ({x}, {y})"
+            elif action == "type":
+                if not params.get("text"):
+                    raise ValueError("Text required for typing")
+                text = params["text"]
+                api_client.type_text(computer_id, text)
+                if callback:
+                    callback("tool_result", {"action": "type", "text": text})
+                return f"Typed: \"{text}\""
+            elif action == "key":
+                if not params.get("text"):
+                    raise ValueError("Key required for key press")
+                key = params["text"]
+                # Handle the 'return' key as 'enter' when needed
+                if key.lower() == "return":
+                    key = "enter"
+                api_client.key_press(computer_id, key)
+                if callback:
+                    callback("tool_result", {"action": "key", "key": key})
+                return f"Pressed key: {key}"
+            elif action == "scroll":
+                if not params.get("scroll_direction") or params.get("scroll_amount") is None:
+                    raise ValueError("Direction and amount required for scrolling")
+                direction = params["scroll_direction"]
+                amount = params["scroll_amount"]
+                api_client.scroll(computer_id, direction, amount)
+                if callback:
+                    callback("tool_result", {"action": "scroll", "direction": direction, "amount": amount})
+                return f"Scrolled {direction} by {amount}"
+            elif action == "wait":
+                duration = params.get("duration", 1)
+                api_client.wait(computer_id, duration)
+                if callback:
+                    callback("tool_result", {"action": "wait", "duration": duration})
+                return f"Waited for {duration} second(s)"
+            else:
+                error_msg = f"Unsupported action: {action}"
+                if callback:
+                    callback("error", error_msg)
+                raise ValueError(error_msg)
+        except Exception as e:
+            error_msg = f"Error executing {action}: {str(e)}"
+            if callback:
+                callback("error", error_msg)
+            return f"Error: {error_msg}"
+# Default provider mapping
+PROVIDER_MAPPING = {
+    "anthropic": AnthropicProvider,
+    # Add more providers here as needed, e.g.:
+    # "openai": OpenAIProvider,
+    # "fireworks": FireworksProvider,
+}
+def get_provider(provider_name: str = "anthropic") -> PromptProvider:
+    """
+    Get a prompt provider by name.
+    Args:
+        provider_name: Name of the provider
+    Returns:
+        Provider instance
+    """
+    if provider_name not in PROVIDER_MAPPING:
+        raise ValueError(f"Unknown provider: {provider_name}. Available providers: {', '.join(PROVIDER_MAPPING.keys())}")
+    return PROVIDER_MAPPING[provider_name]()

{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/METADATA RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: orgo
-Version: 0.0.9
-Summary: Desktop infrastructure for AI agents
+Version: 0.0.11
+Summary: Computers for AI agents
 Author: Orgo Team
 License: MIT
 Project-URL: Homepage, https://www.orgo.ai

{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,15 @@
 orgo/__init__.py,sha256=SjO41JOwyLmX8K6fKu7LAIOyBk8sAtQKRaNVu2wZVBE,108
-orgo/computer.py,sha256=iMPNC2y-YvNdvzc2-XrqDlLlZEeVsm3HBxNbOfYwjs0,3242
+orgo/computer.py,sha256=3S-LoxhgDFiQRajOqeGZkzL93a0l09BEmLO3ADphFFI,7140
+orgo/prompt.py,sha256=XCB54dCAwYsQLTB3cTFoatTm8a5mDOwDlZEoyZGEMhA,17490
 orgo/adapters/__init__.py,sha256=LMpWJGVHvvSrLPhCJrMSENYSJ8ifWIpTfi88L6aN65I,219
 orgo/adapters/anthropic.py,sha256=KfPD5YWbwECZkpj421rSLCQf9SMr-EJ4nePe2EMXDWA,2342
 orgo/adapters/base.py,sha256=W1dNArKn1ri-X_36KuN-h5yxh1LmjIYCZMkKi9dm020,559
 orgo/adapters/openai.py,sha256=3NFmUYmaCSXHZImMvqj-fVc8oUxxrcU_6voBx2eFf5A,2605
 orgo/api/__init__.py,sha256=nE9fyIZw2q4mGqy063-1RAbBgy_Qhx11gN_swkhmbX8,86
-orgo/api/client.py,sha256=y1oG7DHfhgESqpEZ_Ui9q9K1hva7V9RqnsjUr8KI0yE,4191
+orgo/api/client.py,sha256=xMTE6kBN9y9ceiTwxkZOg8pi15a2mYk_-AfSBKV6wUg,4264
 orgo/utils/__init__.py,sha256=XSx9W-IPAc7yDnkwgED4v5eBUIQCxmokr_VQzF6LOZs,94
 orgo/utils/auth.py,sha256=mpnaOvM3BGIdZ_j9cTw8K34gPpCeLRrG0rZfoojzONc,484
-orgo-0.0.9.dist-info/METADATA,sha256=6LEeHhb5UYT4TDCZuabcpaQBMxA6Lycr-UTt19lbGX0,750
-orgo-0.0.9.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
-orgo-0.0.9.dist-info/top_level.txt,sha256=q0rYtFji8GbYuhFW8A5Ab9e0j27761IKPhnL0E9xow4,5
-orgo-0.0.9.dist-info/RECORD,,
+orgo-0.0.11.dist-info/METADATA,sha256=FlzfZeC5pIiF3gEFDSpVEUMpBp4hZ5c8S_Q908drSq0,738
+orgo-0.0.11.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
+orgo-0.0.11.dist-info/top_level.txt,sha256=q0rYtFji8GbYuhFW8A5Ab9e0j27761IKPhnL0E9xow4,5
+orgo-0.0.11.dist-info/RECORD,,

{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (79.0.0)
+Generator: setuptools (80.4.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{orgo-0.0.9.dist-info → orgo-0.0.11.dist-info}/top_level.txt RENAMED Viewed

File without changes

orgo 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl

orgo-0.0.9-py3-none-any.whl → orgo-0.0.11-py3-none-any.whl