PyPI - cua-agent - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl - Mend

cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (57)

agent/__init__.py +3 -2
agent/core/__init__.py +1 -6
agent/core/{computer_agent.py → agent.py} +31 -76
agent/core/{loop.py → base.py} +68 -127
agent/core/factory.py +104 -0
agent/core/messages.py +279 -125
agent/core/provider_config.py +15 -0
agent/core/types.py +45 -0
agent/core/visualization.py +197 -0
agent/providers/anthropic/api/client.py +142 -1
agent/providers/anthropic/api_handler.py +140 -0
agent/providers/anthropic/callbacks/__init__.py +5 -0
agent/providers/anthropic/loop.py +207 -221
agent/providers/anthropic/response_handler.py +226 -0
agent/providers/anthropic/tools/bash.py +0 -97
agent/providers/anthropic/utils.py +368 -0
agent/providers/omni/__init__.py +1 -20
agent/providers/omni/api_handler.py +42 -0
agent/providers/omni/clients/anthropic.py +4 -0
agent/providers/omni/image_utils.py +0 -72
agent/providers/omni/loop.py +491 -607
agent/providers/omni/parser.py +58 -4
agent/providers/omni/tools/__init__.py +25 -7
agent/providers/omni/tools/base.py +29 -0
agent/providers/omni/tools/bash.py +43 -38
agent/providers/omni/tools/computer.py +144 -182
agent/providers/omni/tools/manager.py +25 -45
agent/providers/omni/types.py +1 -3
agent/providers/omni/utils.py +224 -145
agent/providers/openai/__init__.py +6 -0
agent/providers/openai/api_handler.py +453 -0
agent/providers/openai/loop.py +440 -0
agent/providers/openai/response_handler.py +205 -0
agent/providers/openai/tools/__init__.py +15 -0
agent/providers/openai/tools/base.py +79 -0
agent/providers/openai/tools/computer.py +319 -0
agent/providers/openai/tools/manager.py +106 -0
agent/providers/openai/types.py +36 -0
agent/providers/openai/utils.py +98 -0
cua_agent-0.1.18.dist-info/METADATA +165 -0
cua_agent-0.1.18.dist-info/RECORD +73 -0
agent/README.md +0 -63
agent/providers/anthropic/messages/manager.py +0 -112
agent/providers/omni/callbacks.py +0 -78
agent/providers/omni/clients/groq.py +0 -101
agent/providers/omni/experiment.py +0 -276
agent/providers/omni/messages.py +0 -171
agent/providers/omni/tool_manager.py +0 -91
agent/providers/omni/visualization.py +0 -130
agent/types/__init__.py +0 -23
agent/types/base.py +0 -41
agent/types/messages.py +0 -36
cua_agent-0.1.6.dist-info/METADATA +0 -120
cua_agent-0.1.6.dist-info/RECORD +0 -64
agent/{types → core}/tools.py +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/WHEEL +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/entry_points.txt +0 -0

agent/providers/omni/tools/manager.py CHANGED Viewed

@@ -1,81 +1,61 @@
-"""Omni tool manager implementation."""
-from typing import Dict, List, Any
-from enum import Enum
+"""Tool manager for the Omni provider."""
+from typing import Any, Dict, List
 from computer.computer import Computer
-from ....core.tools import BaseToolManager
+from ....core.tools import BaseToolManager, ToolResult
 from ....core.tools.collection import ToolCollection
+from .computer import ComputerTool
+from .bash import BashTool
+from ..types import LLMProvider
-from .bash import OmniBashTool
-from .computer import OmniComputerTool
-class ProviderType(Enum):
-    """Supported provider types."""
-    ANTHROPIC = "anthropic"
-    OPENAI = "openai"
-    CLAUDE = "claude"  # Alias for Anthropic
-    GPT = "gpt"  # Alias for OpenAI
+class ToolManager(BaseToolManager):
+    """Manages Omni provider tool initialization and execution."""
-class OmniToolManager(BaseToolManager):
-    """Tool manager for multi-provider support."""
-    def __init__(self, computer: Computer):
-        """Initialize Omni tool manager.
+    def __init__(self, computer: Computer, provider: LLMProvider):
+        """Initialize the tool manager.
         Args:
-            computer: Computer instance for tools
+            computer: Computer instance for computer-related tools
+            provider: The LLM provider being used
         """
         super().__init__(computer)
-        # Initialize tools
-        self.computer_tool = OmniComputerTool(self.computer)
-        self.bash_tool = OmniBashTool(self.computer)
+        self.provider = provider
+        # Initialize Omni-specific tools
+        self.computer_tool = ComputerTool(self.computer)
+        self.bash_tool = BashTool(self.computer)
     def _initialize_tools(self) -> ToolCollection:
         """Initialize all available tools."""
         return ToolCollection(self.computer_tool, self.bash_tool)
     async def _initialize_tools_specific(self) -> None:
-        """Initialize provider-specific tool requirements."""
+        """Initialize Omni provider-specific tool requirements."""
         await self.computer_tool.initialize_dimensions()
     def get_tool_params(self) -> List[Dict[str, Any]]:
         """Get tool parameters for API calls.
         Returns:
-            List of tool parameters in default format
+            List of tool parameters for the current provider's API
         """
         if self.tools is None:
             raise RuntimeError("Tools not initialized. Call initialize() first.")
         return self.tools.to_params()
-    def get_provider_tools(self, provider: ProviderType) -> List[Dict[str, Any]]:
-        """Get tools formatted for a specific provider.
+    async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> ToolResult:
+        """Execute a tool with the given input.
         Args:
-            provider: Provider type to format tools for
+            name: Name of the tool to execute
+            tool_input: Input parameters for the tool
         Returns:
-            List of tool parameters in provider-specific format
+            Result of the tool execution
         """
         if self.tools is None:
             raise RuntimeError("Tools not initialized. Call initialize() first.")
-        # Default is the base implementation
-        tools = self.tools.to_params()
-        # Customize for each provider if needed
-        if provider in [ProviderType.ANTHROPIC, ProviderType.CLAUDE]:
-            # Format for Anthropic API
-            # Additional adjustments can be made here
-            pass
-        elif provider in [ProviderType.OPENAI, ProviderType.GPT]:
-            # Format for OpenAI API
-            # Future implementation
-            pass
-        return tools
+        return await self.tools.run(name=name, tool_input=tool_input)

agent/providers/omni/types.py CHANGED Viewed

@@ -9,12 +9,10 @@ class LLMProvider(StrEnum):
     """Supported LLM providers."""
     ANTHROPIC = "anthropic"
+    OMNI = "omni"
     OPENAI = "openai"
-LLMProvider
 @dataclass
 class LLM:
     """Configuration for LLM model and provider."""

agent/providers/omni/utils.py CHANGED Viewed

@@ -1,157 +1,236 @@
-"""Utility functions for Omni provider."""
+"""Main entry point for computer agents."""
-import base64
-import io
+import asyncio
+import json
 import logging
-from typing import Tuple
-from PIL import Image
+import os
+from typing import Any, Dict, List, Optional
+from som.models import ParseResult
+from ...core.types import AgentResponse
 logger = logging.getLogger(__name__)
-def compress_image_base64(
-    base64_str: str, max_size_bytes: int = 5 * 1024 * 1024, quality: int = 90
-) -> tuple[str, str]:
-    """Compress a base64 encoded image to ensure it's below a certain size.
+async def to_openai_agent_response_format(
+    response: Any,
+    messages: List[Dict[str, Any]],
+    parsed_screen: Optional[ParseResult] = None,
+    parser: Optional[Any] = None,
+    model: Optional[str] = None,
+) -> AgentResponse:
+    """Create an OpenAI computer use agent compatible response format.
     Args:
-        base64_str: Base64 encoded image string (with or without data URL prefix)
-        max_size_bytes: Maximum size in bytes (default: 5MB)
-        quality: Initial JPEG quality (0-100)
+        response: The original API response
+        messages: List of messages in standard OpenAI format
+        parsed_screen: Optional pre-parsed screen information
+        parser: Optional parser instance for coordinate calculation
+        model: Optional model name
     Returns:
-        tuple[str, str]: (Compressed base64 encoded image, media_type)
+        A response formatted according to OpenAI's computer use agent standard, including:
+        - All standard OpenAI computer use agent fields
+        - Original response in response.choices[0].message
+        - Full message history in messages field
     """
-    # Handle data URL prefix if present (e.g., "data:image/png;base64,...")
-    original_prefix = ""
-    media_type = "image/png"  # Default media type
-    if base64_str.startswith("data:"):
-        parts = base64_str.split(",", 1)
-        if len(parts) == 2:
-            original_prefix = parts[0] + ","
-            base64_str = parts[1]
-            # Try to extract media type from the prefix
-            if "image/jpeg" in original_prefix.lower():
-                media_type = "image/jpeg"
-            elif "image/png" in original_prefix.lower():
-                media_type = "image/png"
-    # Check if the base64 string is small enough already
-    if len(base64_str) <= max_size_bytes:
-        logger.info(f"Image already within size limit: {len(base64_str)} bytes")
-        return original_prefix + base64_str, media_type
-    try:
-        # Decode base64
-        img_data = base64.b64decode(base64_str)
-        img_size = len(img_data)
-        logger.info(f"Original image size: {img_size} bytes")
-        # Open image
-        img = Image.open(io.BytesIO(img_data))
-        # First, try to compress as PNG (maintains transparency if present)
-        buffer = io.BytesIO()
-        img.save(buffer, format="PNG", optimize=True)
-        buffer.seek(0)
-        compressed_data = buffer.getvalue()
-        compressed_b64 = base64.b64encode(compressed_data).decode("utf-8")
-        if len(compressed_b64) <= max_size_bytes:
-            logger.info(f"Compressed to {len(compressed_data)} bytes as PNG")
-            return compressed_b64, "image/png"
-        # Strategy 1: Try reducing quality with JPEG format
-        current_quality = quality
-        while current_quality > 20:
-            buffer = io.BytesIO()
-            # Convert to RGB if image has alpha channel (JPEG doesn't support transparency)
-            if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
-                logger.info("Converting transparent image to RGB for JPEG compression")
-                rgb_img = Image.new("RGB", img.size, (255, 255, 255))
-                rgb_img.paste(img, mask=img.split()[3] if img.mode == "RGBA" else None)
-                rgb_img.save(buffer, format="JPEG", quality=current_quality, optimize=True)
-            else:
-                img.save(buffer, format="JPEG", quality=current_quality, optimize=True)
-            buffer.seek(0)
-            compressed_data = buffer.getvalue()
-            compressed_b64 = base64.b64encode(compressed_data).decode("utf-8")
-            if len(compressed_b64) <= max_size_bytes:
-                logger.info(
-                    f"Compressed to {len(compressed_data)} bytes with JPEG quality {current_quality}"
-                )
-                return compressed_b64, "image/jpeg"
-            # Reduce quality and try again
-            current_quality -= 10
-        # Strategy 2: If quality reduction isn't enough, reduce dimensions
-        scale_factor = 0.8
-        current_img = img
-        while scale_factor > 0.3:
-            # Resize image
-            new_width = int(img.width * scale_factor)
-            new_height = int(img.height * scale_factor)
-            current_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
-            # Try with reduced size and quality
-            buffer = io.BytesIO()
-            # Convert to RGB if necessary for JPEG
-            if current_img.mode in ("RGBA", "LA") or (
-                current_img.mode == "P" and "transparency" in current_img.info
-            ):
-                rgb_img = Image.new("RGB", current_img.size, (255, 255, 255))
-                rgb_img.paste(
-                    current_img, mask=current_img.split()[3] if current_img.mode == "RGBA" else None
-                )
-                rgb_img.save(buffer, format="JPEG", quality=70, optimize=True)
-            else:
-                current_img.save(buffer, format="JPEG", quality=70, optimize=True)
-            buffer.seek(0)
-            compressed_data = buffer.getvalue()
-            compressed_b64 = base64.b64encode(compressed_data).decode("utf-8")
-            if len(compressed_b64) <= max_size_bytes:
-                logger.info(
-                    f"Compressed to {len(compressed_data)} bytes with scale {scale_factor} and JPEG quality 70"
-                )
-                return compressed_b64, "image/jpeg"
-            # Reduce scale factor and try again
-            scale_factor -= 0.1
-        # If we get here, we couldn't compress enough
-        logger.warning("Could not compress image below required size with quality preservation")
-        # Last resort: Use minimum quality and size
-        buffer = io.BytesIO()
-        smallest_img = img.resize(
-            (int(img.width * 0.5), int(img.height * 0.5)), Image.Resampling.LANCZOS
+    from datetime import datetime
+    import time
+    # Create a unique ID for this response
+    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
+    reasoning_id = f"rs_{response_id}"
+    action_id = f"cu_{response_id}"
+    call_id = f"call_{response_id}"
+    # Extract the last assistant message
+    assistant_msg = None
+    for msg in reversed(messages):
+        if msg["role"] == "assistant":
+            assistant_msg = msg
+            break
+    if not assistant_msg:
+        # If no assistant message found, create a default one
+        assistant_msg = {"role": "assistant", "content": "No response available"}
+    # Initialize output array
+    output_items = []
+    # Extract reasoning and action details from the response
+    content = assistant_msg["content"]
+    reasoning_text = None
+    action_details = None
+    for item in content:
+        if isinstance(item, dict) and item.get("type") == "text":
+            try:
+                # Try to parse JSON from text block
+                text_content = item.get("text", "")
+                parsed_json = json.loads(text_content)
+                # Get reasoning text
+                if reasoning_text is None:
+                    reasoning_text = parsed_json.get("Explanation", "")
+                # Extract action details
+                action = parsed_json.get("Action", "").lower()
+                text_input = parsed_json.get("Text", "")
+                value = parsed_json.get("Value", "")  # Also handle Value field
+                box_id = parsed_json.get("Box ID")  # Extract Box ID
+                if action in ["click", "left_click"]:
+                    # Always calculate coordinates from Box ID for click actions
+                    x, y = 100, 100  # Default fallback values
+                    if parsed_screen and box_id is not None and parser is not None:
+                        try:
+                            box_id_int = (
+                                box_id
+                                if isinstance(box_id, int)
+                                else int(str(box_id)) if str(box_id).isdigit() else None
+                            )
+                            if box_id_int is not None:
+                                # Use the parser's method to calculate coordinates
+                                x, y = await parser.calculate_click_coordinates(
+                                    box_id_int, parsed_screen
+                                )
+                        except Exception as e:
+                            logger.error(
+                                f"Error extracting coordinates for Box ID {box_id}: {str(e)}"
+                            )
+                    action_details = {
+                        "type": "click",
+                        "button": "left",
+                        "box_id": (
+                            (
+                                box_id
+                                if isinstance(box_id, int)
+                                else int(box_id) if str(box_id).isdigit() else None
+                            )
+                            if box_id is not None
+                            else None
+                        ),
+                        "x": x,
+                        "y": y,
+                    }
+                elif action in ["type", "type_text"] and (text_input or value):
+                    action_details = {
+                        "type": "type",
+                        "text": text_input or value,
+                    }
+                elif action == "hotkey" and value:
+                    action_details = {
+                        "type": "hotkey",
+                        "keys": value,
+                    }
+                elif action == "scroll":
+                    # Use default coordinates for scrolling
+                    delta_x = 0
+                    delta_y = 0
+                    # Try to extract scroll delta values from content if available
+                    scroll_data = parsed_json.get("Scroll", {})
+                    if scroll_data:
+                        delta_x = scroll_data.get("delta_x", 0)
+                        delta_y = scroll_data.get("delta_y", 0)
+                    action_details = {
+                        "type": "scroll",
+                        "x": 100,
+                        "y": 100,
+                        "scroll_x": delta_x,
+                        "scroll_y": delta_y,
+                    }
+                elif action == "none":
+                    # Handle case when action is None (task completion)
+                    action_details = {"type": "none", "description": "Task completed"}
+            except json.JSONDecodeError:
+                # If not JSON, just use as reasoning text
+                if reasoning_text is None:
+                    reasoning_text = ""
+                reasoning_text += item.get("text", "")
+    # Add reasoning item if we have text content
+    if reasoning_text:
+        output_items.append(
+            {
+                "type": "reasoning",
+                "id": reasoning_id,
+                "summary": [
+                    {
+                        "type": "summary_text",
+                        "text": reasoning_text[:200],  # Truncate to reasonable length
+                    }
+                ],
+            }
         )
-        # Convert to RGB if necessary
-        if smallest_img.mode in ("RGBA", "LA") or (
-            smallest_img.mode == "P" and "transparency" in smallest_img.info
-        ):
-            rgb_img = Image.new("RGB", smallest_img.size, (255, 255, 255))
-            rgb_img.paste(
-                smallest_img, mask=smallest_img.split()[3] if smallest_img.mode == "RGBA" else None
-            )
-            rgb_img.save(buffer, format="JPEG", quality=20, optimize=True)
-        else:
-            smallest_img.save(buffer, format="JPEG", quality=20, optimize=True)
-        buffer.seek(0)
-        final_data = buffer.getvalue()
-        final_b64 = base64.b64encode(final_data).decode("utf-8")
-        logger.warning(f"Final compressed size: {len(final_b64)} bytes (may still exceed limit)")
-        return final_b64, "image/jpeg"
-    except Exception as e:
-        logger.error(f"Error compressing image: {str(e)}")
-        raise
+    # If no action details extracted, use default
+    if not action_details:
+        action_details = {
+            "type": "click",
+            "button": "left",
+            "x": 100,
+            "y": 100,
+        }
+    # Add computer_call item
+    computer_call = {
+        "type": "computer_call",
+        "id": action_id,
+        "call_id": call_id,
+        "action": action_details,
+        "pending_safety_checks": [],
+        "status": "completed",
+    }
+    output_items.append(computer_call)
+    # Extract user and assistant messages from the history
+    user_messages = []
+    assistant_messages = []
+    for msg in messages:
+        if msg["role"] == "user":
+            user_messages.append(msg)
+        elif msg["role"] == "assistant":
+            assistant_messages.append(msg)
+    # Create the OpenAI-compatible response format with all expected fields
+    return {
+        "id": response_id,
+        "object": "response",
+        "created_at": int(time.time()),
+        "status": "completed",
+        "error": None,
+        "incomplete_details": None,
+        "instructions": None,
+        "max_output_tokens": None,
+        "model": model or "unknown",
+        "output": output_items,
+        "parallel_tool_calls": True,
+        "previous_response_id": None,
+        "reasoning": {"effort": "medium", "generate_summary": "concise"},
+        "store": True,
+        "temperature": 1.0,
+        "text": {"format": {"type": "text"}},
+        "tool_choice": "auto",
+        "tools": [
+            {
+                "type": "computer_use_preview",
+                "display_height": 768,
+                "display_width": 1024,
+                "environment": "mac",
+            }
+        ],
+        "top_p": 1.0,
+        "truncation": "auto",
+        "usage": {
+            "input_tokens": 0,  # Placeholder values
+            "input_tokens_details": {"cached_tokens": 0},
+            "output_tokens": 0,  # Placeholder values
+            "output_tokens_details": {"reasoning_tokens": 0},
+            "total_tokens": 0,  # Placeholder values
+        },
+        "user": None,
+        "metadata": {},
+        # Include the original response for backward compatibility
+        "response": {"choices": [{"message": assistant_msg, "finish_reason": "stop"}]},
+    }

agent/providers/openai/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""OpenAI Agent Response API provider for computer control."""
+from .types import LLMProvider
+from .loop import OpenAILoop
+__all__ = ["OpenAILoop", "LLMProvider"]

cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl