cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff compares the content of two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +3 -2
- agent/core/__init__.py +0 -5
- agent/core/computer_agent.py +21 -28
- agent/core/loop.py +78 -124
- agent/core/messages.py +279 -125
- agent/core/types.py +35 -0
- agent/core/visualization.py +197 -0
- agent/providers/anthropic/api/client.py +142 -1
- agent/providers/anthropic/api_handler.py +140 -0
- agent/providers/anthropic/callbacks/__init__.py +5 -0
- agent/providers/anthropic/loop.py +206 -220
- agent/providers/anthropic/response_handler.py +229 -0
- agent/providers/anthropic/tools/bash.py +0 -97
- agent/providers/anthropic/utils.py +370 -0
- agent/providers/omni/__init__.py +1 -20
- agent/providers/omni/api_handler.py +42 -0
- agent/providers/omni/clients/anthropic.py +4 -0
- agent/providers/omni/image_utils.py +0 -72
- agent/providers/omni/loop.py +490 -606
- agent/providers/omni/parser.py +58 -4
- agent/providers/omni/tools/__init__.py +25 -7
- agent/providers/omni/tools/base.py +29 -0
- agent/providers/omni/tools/bash.py +43 -38
- agent/providers/omni/tools/computer.py +144 -182
- agent/providers/omni/tools/manager.py +25 -45
- agent/providers/omni/types.py +0 -4
- agent/providers/omni/utils.py +224 -145
- {cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
- cua_agent-0.1.17.dist-info/RECORD +63 -0
- agent/providers/omni/callbacks.py +0 -78
- agent/providers/omni/clients/groq.py +0 -101
- agent/providers/omni/experiment.py +0 -276
- agent/providers/omni/messages.py +0 -171
- agent/providers/omni/tool_manager.py +0 -91
- agent/providers/omni/visualization.py +0 -130
- agent/types/__init__.py +0 -23
- agent/types/base.py +0 -41
- agent/types/messages.py +0 -36
- cua_agent-0.1.6.dist-info/RECORD +0 -64
- /agent/{types → core}/tools.py +0 -0
- {cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
agent/providers/anthropic/utils.py
ADDED
@@ -0,0 +1,370 @@

```python
"""Utility functions for Anthropic message handling."""

import time
import logging
import re
from typing import Any, Dict, List, Optional, Tuple, cast
from anthropic.types.beta import BetaMessage, BetaMessageParam, BetaTextBlock
from ..omni.parser import ParseResult
from ...core.types import AgentResponse
from datetime import datetime
import json

# Configure module logger
logger = logging.getLogger(__name__)


def to_anthropic_format(
    messages: List[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], str]:
    """Convert standard OpenAI format messages to Anthropic format.

    Args:
        messages: List of messages in OpenAI format

    Returns:
        Tuple containing (anthropic_messages, system_content)
    """
    result = []
    system_content = ""

    # Process messages in order to maintain conversation flow
    previous_assistant_tool_use_ids = set()  # Track tool_use_ids in the previous assistant message

    for i, msg in enumerate(messages):
        role = msg.get("role", "")
        content = msg.get("content", "")

        if role == "system":
            # Collect system messages for later use
            system_content += content + "\n"
            continue

        if role == "assistant":
            # Track tool_use_ids in this assistant message for the next user message
            previous_assistant_tool_use_ids = set()
            if isinstance(content, list):
                for item in content:
                    if isinstance(item, dict) and item.get("type") == "tool_use" and "id" in item:
                        previous_assistant_tool_use_ids.add(item["id"])

        if role in ["user", "assistant"]:
            anthropic_msg = {"role": role}

            # Convert content based on type
            if isinstance(content, str):
                # Simple text content
                anthropic_msg["content"] = [{"type": "text", "text": content}]
            elif isinstance(content, list):
                # Convert complex content
                anthropic_content = []
                for item in content:
                    item_type = item.get("type", "")

                    if item_type == "text":
                        anthropic_content.append({"type": "text", "text": item.get("text", "")})
                    elif item_type == "image_url":
                        # Convert OpenAI image format to Anthropic
                        image_url = item.get("image_url", {}).get("url", "")
                        if image_url.startswith("data:"):
                            # Extract base64 data and media type
                            match = re.match(r"data:(.+);base64,(.+)", image_url)
                            if match:
                                media_type, data = match.groups()
                                anthropic_content.append(
                                    {
                                        "type": "image",
                                        "source": {
                                            "type": "base64",
                                            "media_type": media_type,
                                            "data": data,
                                        },
                                    }
                                )
                        else:
                            # Regular URL
                            anthropic_content.append(
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "url",
                                        "url": image_url,
                                    },
                                }
                            )
                    elif item_type == "tool_use":
                        # Always include tool_use blocks
                        anthropic_content.append(item)
                    elif item_type == "tool_result":
                        # Check if this is a user message AND if the tool_use_id exists in the previous assistant message
                        tool_use_id = item.get("tool_use_id")

                        # Only include tool_result if it references a tool_use from the immediately preceding assistant message
                        if (
                            role == "user"
                            and tool_use_id
                            and tool_use_id in previous_assistant_tool_use_ids
                        ):
                            anthropic_content.append(item)
                        else:
                            content_text = "Tool Result: "
                            if "content" in item:
                                if isinstance(item["content"], list):
                                    for content_item in item["content"]:
                                        if (
                                            isinstance(content_item, dict)
                                            and content_item.get("type") == "text"
                                        ):
                                            content_text += content_item.get("text", "")
                                elif isinstance(item["content"], str):
                                    content_text += item["content"]
                            anthropic_content.append({"type": "text", "text": content_text})

                anthropic_msg["content"] = anthropic_content

            result.append(anthropic_msg)

    return result, system_content


def from_anthropic_format(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert Anthropic format messages to standard OpenAI format.

    Args:
        messages: List of messages in Anthropic format

    Returns:
        List of messages in OpenAI format
    """
    result = []

    for msg in messages:
        role = msg.get("role", "")
        content = msg.get("content", [])

        if role in ["user", "assistant"]:
            openai_msg = {"role": role}

            # Simple case: single text block
            if len(content) == 1 and content[0].get("type") == "text":
                openai_msg["content"] = content[0].get("text", "")
            else:
                # Complex case: multiple blocks or non-text
                openai_content = []
                for item in content:
                    item_type = item.get("type", "")

                    if item_type == "text":
                        openai_content.append({"type": "text", "text": item.get("text", "")})
                    elif item_type == "image":
                        # Convert Anthropic image to OpenAI format
                        source = item.get("source", {})
                        if source.get("type") == "base64":
                            media_type = source.get("media_type", "image/png")
                            data = source.get("data", "")
                            openai_content.append(
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:{media_type};base64,{data}"},
                                }
                            )
                        else:
                            # URL
                            openai_content.append(
                                {
                                    "type": "image_url",
                                    "image_url": {"url": source.get("url", "")},
                                }
                            )
                    elif item_type in ["tool_use", "tool_result"]:
                        # Pass through tool-related content
                        openai_content.append(item)

                openai_msg["content"] = openai_content

            result.append(openai_msg)

    return result


async def to_agent_response_format(
    response: BetaMessage,
    messages: List[Dict[str, Any]],
    parsed_screen: Optional[ParseResult] = None,
    parser: Optional[Any] = None,
    model: Optional[str] = None,
) -> AgentResponse:
    """Convert an Anthropic response to the standard agent response format.

    Args:
        response: The Anthropic API response (BetaMessage)
        messages: List of messages in standard format
        parsed_screen: Optional pre-parsed screen information
        parser: Optional parser instance for coordinate calculation
        model: Optional model name

    Returns:
        A response formatted according to the standard agent response format
    """
    # Create unique IDs for this response
    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
    reasoning_id = f"rs_{response_id}"
    action_id = f"cu_{response_id}"
    call_id = f"call_{response_id}"

    # Extract content and reasoning from Anthropic response
    content = []
    reasoning_text = None
    action_details = None

    for block in response.content:
        if block.type == "text":
            # Use the first text block as reasoning
            if reasoning_text is None:
                reasoning_text = block.text
            content.append({"type": "text", "text": block.text})
        elif block.type == "tool_use" and block.name == "computer":
            try:
                input_dict = cast(Dict[str, Any], block.input)
                action = input_dict.get("action", "").lower()

                # Extract coordinates from coordinate list if provided
                coordinates = input_dict.get("coordinate", [100, 100])
                x, y = coordinates if len(coordinates) == 2 else (100, 100)

                if action == "screenshot":
                    action_details = {
                        "type": "screenshot",
                    }
                elif action in ["click", "left_click", "right_click", "double_click"]:
                    action_details = {
                        "type": "click",
                        "button": "left" if action in ["click", "left_click"] else "right",
                        "double": action == "double_click",
                        "x": x,
                        "y": y,
                    }
                elif action == "type":
                    action_details = {
                        "type": "type",
                        "text": input_dict.get("text", ""),
                    }
                elif action == "key":
                    action_details = {
                        "type": "hotkey",
                        "keys": [input_dict.get("text", "")],
                    }
                elif action == "scroll":
                    scroll_amount = input_dict.get("scroll_amount", 1)
                    scroll_direction = input_dict.get("scroll_direction", "down")
                    delta_y = scroll_amount if scroll_direction == "down" else -scroll_amount
                    action_details = {
                        "type": "scroll",
                        "x": x,
                        "y": y,
                        "delta_x": 0,
                        "delta_y": delta_y,
                    }
                elif action == "move":
                    action_details = {
                        "type": "move",
                        "x": x,
                        "y": y,
                    }
            except Exception as e:
                logger.error(f"Error extracting action details: {str(e)}")

    # Create output items with reasoning
    output_items = []
    if reasoning_text:
        output_items.append(
            {
                "type": "reasoning",
                "id": reasoning_id,
                "summary": [
                    {
                        "type": "summary_text",
                        "text": reasoning_text,
                    }
                ],
            }
        )

    # Add computer_call item with extracted or default action
    computer_call = {
        "type": "computer_call",
        "id": action_id,
        "call_id": call_id,
        "action": action_details or {"type": "none", "description": "No action specified"},
        "pending_safety_checks": [],
        "status": "completed",
    }
    output_items.append(computer_call)

    # Create the standard response format
    standard_response = {
        "id": response_id,
        "object": "response",
        "created_at": int(datetime.now().timestamp()),
        "status": "completed",
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "max_output_tokens": None,
        "model": model or "anthropic-default",
        "output": output_items,
        "parallel_tool_calls": True,
        "previous_response_id": None,
        "reasoning": {"effort": "medium", "generate_summary": "concise"},
        "store": True,
        "temperature": 1.0,
        "text": {"format": {"type": "text"}},
        "tool_choice": "auto",
        "tools": [
            {
                "type": "computer_use_preview",
                "display_height": 768,
                "display_width": 1024,
                "environment": "mac",
            }
        ],
        "top_p": 1.0,
        "truncation": "auto",
        "usage": {
            "input_tokens": 0,
            "input_tokens_details": {"cached_tokens": 0},
            "output_tokens": 0,
            "output_tokens_details": {"reasoning_tokens": 0},
            "total_tokens": 0,
        },
        "user": None,
        "metadata": {},
        "response": {
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": content,
                        "tool_calls": [],
                    },
                    "finish_reason": response.stop_reason or "stop",
                }
            ]
        },
    }

    # Add tool calls if present
    tool_calls = []
    for block in response.content:
        if hasattr(block, "type") and block.type == "tool_use":
            tool_calls.append(
                {
                    "id": f"call_{block.id}",
                    "type": "function",
                    "function": {"name": block.name, "arguments": block.input},
                }
            )
    if tool_calls:
        standard_response["response"]["choices"][0]["message"]["tool_calls"] = tool_calls

    return cast(AgentResponse, standard_response)
```
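For reference, a minimal round trip through the two converters above (a sketch, assuming cua-agent 0.1.17 is installed so that `agent.providers.anthropic.utils` is importable):

```python
from agent.providers.anthropic.utils import from_anthropic_format, to_anthropic_format

openai_messages = [
    {"role": "system", "content": "You are a computer-use agent."},
    {"role": "user", "content": "Take a screenshot."},
]

# System messages are stripped out and returned separately; the remaining
# messages are rewritten into Anthropic's block-based content format.
anthropic_messages, system_content = to_anthropic_format(openai_messages)
assert system_content.strip() == "You are a computer-use agent."
assert anthropic_messages[0]["content"] == [{"type": "text", "text": "Take a screenshot."}]

# Converting back collapses a lone text block into a plain string.
assert from_anthropic_format(anthropic_messages)[0]["content"] == "Take a screenshot."
```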
agent/providers/omni/__init__.py
CHANGED
```diff
@@ -1,27 +1,8 @@
 """Omni provider implementation."""
 
-# The OmniComputerAgent has been replaced by the unified ComputerAgent
-# which can be found in agent.core.agent
 from .types import LLMProvider
-from .experiment import ExperimentManager
-from .visualization import visualize_click, visualize_scroll, calculate_element_center
 from .image_utils import (
     decode_base64_image,
-    encode_image_base64,
-    clean_base64_data,
-    extract_base64_from_text,
-    get_image_dimensions,
 )
 
-__all__ = [
-    "LLMProvider",
-    "ExperimentManager",
-    "visualize_click",
-    "visualize_scroll",
-    "calculate_element_center",
-    "decode_base64_image",
-    "encode_image_base64",
-    "clean_base64_data",
-    "extract_base64_from_text",
-    "get_image_dimensions",
-]
+__all__ = ["LLMProvider", "decode_base64_image"]
```
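After this change the package root re-exports only two names; everything else dropped here (experiment management, visualization, the extra image helpers) must be imported from wherever it now lives, or reimplemented by callers. The surviving public surface:

```python
# The omni provider's public API as of 0.1.17.
from agent.providers.omni import LLMProvider, decode_base64_image
```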
agent/providers/omni/api_handler.py
ADDED
@@ -0,0 +1,42 @@

```python
"""API handling for Omni provider."""

import logging
from typing import Any, Dict, List

from .prompts import SYSTEM_PROMPT

logger = logging.getLogger(__name__)


class OmniAPIHandler:
    """Handler for Omni API calls."""

    def __init__(self, loop):
        """Initialize the API handler.

        Args:
            loop: Parent loop instance
        """
        self.loop = loop

    async def make_api_call(
        self, messages: List[Dict[str, Any]], system_prompt: str = SYSTEM_PROMPT
    ) -> Any:
        """Make an API call to the appropriate provider.

        Args:
            messages: List of messages in standard OpenAI format
            system_prompt: System prompt to use

        Returns:
            API response
        """
        if not self.loop._make_api_call:
            raise RuntimeError("Loop does not have _make_api_call method")

        try:
            # Use the loop's _make_api_call method with standard messages
            return await self.loop._make_api_call(messages=messages, system_prompt=system_prompt)
        except Exception as e:
            logger.error(f"Error making API call: {str(e)}")
            raise
```
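The handler is a thin delegate: it forwards to the parent loop's `_make_api_call` and re-raises on failure. A sketch of the wiring, where `StubLoop` is a made-up stand-in for the real loop class (assumed only to expose an async `_make_api_call(messages, system_prompt)`):

```python
import asyncio

from agent.providers.omni.api_handler import OmniAPIHandler


class StubLoop:
    """Fake loop exposing the one method OmniAPIHandler relies on."""

    async def _make_api_call(self, messages, system_prompt):
        return {"echoed_messages": messages, "system": system_prompt}


async def main():
    handler = OmniAPIHandler(loop=StubLoop())
    response = await handler.make_api_call(
        messages=[{"role": "user", "content": "hello"}],
        system_prompt="You are a computer-use agent.",
    )
    print(response)


asyncio.run(main())
```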
agent/providers/omni/clients/anthropic.py
CHANGED

```diff
@@ -44,6 +44,10 @@ class AnthropicClient(BaseOmniClient):
         anthropic_messages = []
 
         for message in messages:
+            # Skip messages with empty content
+            if not message.get("content"):
+                continue
+
             if message["role"] == "user":
                 anthropic_messages.append({"role": "user", "content": message["content"]})
             elif message["role"] == "assistant":
```
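The new guard drops any message whose `content` is falsy (an empty string, an empty list, or a missing key) before role dispatch. The same predicate in isolation, as a quick illustration rather than the client's actual code path:

```python
# Messages the guard skips (falsy "content") versus the ones it keeps.
messages = [
    {"role": "user", "content": ""},       # skipped
    {"role": "assistant", "content": []},  # skipped
    {"role": "user", "content": "hi"},     # kept
]
kept = [m for m in messages if m.get("content")]
assert kept == [{"role": "user", "content": "hi"}]
```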
agent/providers/omni/image_utils.py
CHANGED

```diff
@@ -32,75 +32,3 @@ def decode_base64_image(img_base64: str) -> Optional[Image.Image]:
     except Exception as e:
         logger.error(f"Error decoding base64 image: {str(e)}")
         return None
-
-
-def encode_image_base64(img: Image.Image, format: str = "PNG") -> str:
-    """Encode a PIL Image to base64.
-
-    Args:
-        img: PIL Image to encode
-        format: Image format (PNG, JPEG, etc.)
-
-    Returns:
-        Base64 encoded image string
-    """
-    try:
-        buffered = BytesIO()
-        img.save(buffered, format=format)
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
-    except Exception as e:
-        logger.error(f"Error encoding image to base64: {str(e)}")
-        return ""
-
-
-def clean_base64_data(img_base64: str) -> str:
-    """Clean base64 image data by removing data URL prefix.
-
-    Args:
-        img_base64: Base64 encoded image, may include data URL prefix
-
-    Returns:
-        Clean base64 string without prefix
-    """
-    if img_base64.startswith("data:image"):
-        return img_base64.split(",")[1]
-    return img_base64
-
-
-def extract_base64_from_text(text: str) -> Optional[str]:
-    """Extract base64 image data from a text string.
-
-    Args:
-        text: Text potentially containing base64 image data
-
-    Returns:
-        Base64 string or None if not found
-    """
-    # Look for data URL pattern
-    data_url_pattern = r"data:image/[^;]+;base64,([a-zA-Z0-9+/=]+)"
-    match = re.search(data_url_pattern, text)
-    if match:
-        return match.group(1)
-
-    # Look for plain base64 pattern (basic heuristic)
-    base64_pattern = r"([a-zA-Z0-9+/=]{100,})"
-    match = re.search(base64_pattern, text)
-    if match:
-        return match.group(1)
-
-    return None
-
-
-def get_image_dimensions(img_base64: str) -> Tuple[int, int]:
-    """Get the dimensions of a base64 encoded image.
-
-    Args:
-        img_base64: Base64 encoded image
-
-    Returns:
-        Tuple of (width, height) or (0, 0) if decoding fails
-    """
-    img = decode_base64_image(img_base64)
-    if img:
-        return img.size
-    return (0, 0)
```