cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

Files changed (57)
  1. agent/__init__.py +3 -2
  2. agent/core/__init__.py +1 -6
  3. agent/core/{computer_agent.py → agent.py} +31 -76
  4. agent/core/{loop.py → base.py} +68 -127
  5. agent/core/factory.py +104 -0
  6. agent/core/messages.py +279 -125
  7. agent/core/provider_config.py +15 -0
  8. agent/core/types.py +45 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +207 -221
  14. agent/providers/anthropic/response_handler.py +226 -0
  15. agent/providers/anthropic/tools/bash.py +0 -97
  16. agent/providers/anthropic/utils.py +368 -0
  17. agent/providers/omni/__init__.py +1 -20
  18. agent/providers/omni/api_handler.py +42 -0
  19. agent/providers/omni/clients/anthropic.py +4 -0
  20. agent/providers/omni/image_utils.py +0 -72
  21. agent/providers/omni/loop.py +491 -607
  22. agent/providers/omni/parser.py +58 -4
  23. agent/providers/omni/tools/__init__.py +25 -7
  24. agent/providers/omni/tools/base.py +29 -0
  25. agent/providers/omni/tools/bash.py +43 -38
  26. agent/providers/omni/tools/computer.py +144 -182
  27. agent/providers/omni/tools/manager.py +25 -45
  28. agent/providers/omni/types.py +1 -3
  29. agent/providers/omni/utils.py +224 -145
  30. agent/providers/openai/__init__.py +6 -0
  31. agent/providers/openai/api_handler.py +453 -0
  32. agent/providers/openai/loop.py +440 -0
  33. agent/providers/openai/response_handler.py +205 -0
  34. agent/providers/openai/tools/__init__.py +15 -0
  35. agent/providers/openai/tools/base.py +79 -0
  36. agent/providers/openai/tools/computer.py +319 -0
  37. agent/providers/openai/tools/manager.py +106 -0
  38. agent/providers/openai/types.py +36 -0
  39. agent/providers/openai/utils.py +98 -0
  40. cua_agent-0.1.18.dist-info/METADATA +165 -0
  41. cua_agent-0.1.18.dist-info/RECORD +73 -0
  42. agent/README.md +0 -63
  43. agent/providers/anthropic/messages/manager.py +0 -112
  44. agent/providers/omni/callbacks.py +0 -78
  45. agent/providers/omni/clients/groq.py +0 -101
  46. agent/providers/omni/experiment.py +0 -276
  47. agent/providers/omni/messages.py +0 -171
  48. agent/providers/omni/tool_manager.py +0 -91
  49. agent/providers/omni/visualization.py +0 -130
  50. agent/types/__init__.py +0 -23
  51. agent/types/base.py +0 -41
  52. agent/types/messages.py +0 -36
  53. cua_agent-0.1.6.dist-info/METADATA +0 -120
  54. cua_agent-0.1.6.dist-info/RECORD +0 -64
  55. /agent/{types → core}/tools.py +0 -0
  56. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/WHEEL +0 -0
  57. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/entry_points.txt +0 -0
agent/core/messages.py CHANGED
@@ -1,12 +1,11 @@
 """Message handling utilities for agent."""
 
-import base64
-from datetime import datetime
-from io import BytesIO
 import logging
-from typing import Any, Dict, List, Optional, Union
-from PIL import Image
+import json
+from typing import Any, Dict, List, Optional, Union, Tuple
 from dataclasses import dataclass
+import re
+from ..providers.omni.parser import ParseResult
 
 logger = logging.getLogger(__name__)
 
@@ -123,123 +122,278 @@ class BaseMessageManager:
                 break
 
 
-def create_user_message(text: str) -> Dict[str, str]:
-    """Create a user message.
-
-    Args:
-        text: The message text
-
-    Returns:
-        Message dictionary
-    """
-    return {
-        "role": "user",
-        "content": text,
-    }
-
-
-def create_assistant_message(text: str) -> Dict[str, str]:
-    """Create an assistant message.
-
-    Args:
-        text: The message text
-
-    Returns:
-        Message dictionary
-    """
-    return {
-        "role": "assistant",
-        "content": text,
-    }
-
-
-def create_system_message(text: str) -> Dict[str, str]:
-    """Create a system message.
-
-    Args:
-        text: The message text
-
-    Returns:
-        Message dictionary
-    """
-    return {
-        "role": "system",
-        "content": text,
-    }
-
-
-def create_image_message(
-    image_base64: Optional[str] = None,
-    image_path: Optional[str] = None,
-    image_obj: Optional[Image.Image] = None,
-) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
-    """Create a message with an image.
-
-    Args:
-        image_base64: Base64 encoded image
-        image_path: Path to image file
-        image_obj: PIL Image object
-
-    Returns:
-        Message dictionary with content list
-
-    Raises:
-        ValueError: If no image source is provided
-    """
-    if not any([image_base64, image_path, image_obj]):
-        raise ValueError("Must provide one of image_base64, image_path, or image_obj")
-
-    # Convert to base64 if needed
-    if image_path and not image_base64:
-        with open(image_path, "rb") as f:
-            image_bytes = f.read()
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-    elif image_obj and not image_base64:
-        buffer = BytesIO()
-        image_obj.save(buffer, format="PNG")
-        image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
-
-    return {
-        "role": "user",
-        "content": [
-            {"type": "image", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
-        ],
-    }
-
-
-def create_screen_message(
-    parsed_screen: Dict[str, Any],
-    include_raw: bool = False,
-) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
-    """Create a message with screen information.
-
-    Args:
-        parsed_screen: Dictionary containing parsed screen info
-        include_raw: Whether to include raw screenshot base64
-
-    Returns:
-        Message dictionary with content
-    """
-    if include_raw and "screenshot_base64" in parsed_screen:
-        # Create content list with both image and text
-        return {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{parsed_screen['screenshot_base64']}"
-                    },
-                },
-                {
-                    "type": "text",
-                    "text": f"Screen dimensions: {parsed_screen['width']}x{parsed_screen['height']}",
-                },
-            ],
-        }
-    else:
-        # Create text-only message with screen info
-        return {
-            "role": "user",
-            "content": f"Screen dimensions: {parsed_screen['width']}x{parsed_screen['height']}",
-        }
+class StandardMessageManager:
+    """Manages messages in a standardized OpenAI format across different providers."""
+
+    def __init__(self, config: Optional[ImageRetentionConfig] = None):
+        """Initialize message manager.
+
+        Args:
+            config: Configuration for image retention
+        """
+        self.messages: List[Dict[str, Any]] = []
+        self.config = config or ImageRetentionConfig()
+
+    def add_user_message(self, content: Union[str, List[Dict[str, Any]]]) -> None:
+        """Add a user message.
+
+        Args:
+            content: Message content (text or multimodal content)
+        """
+        self.messages.append({"role": "user", "content": content})
+
+    def add_assistant_message(self, content: Union[str, List[Dict[str, Any]]]) -> None:
+        """Add an assistant message.
+
+        Args:
+            content: Message content (text or multimodal content)
+        """
+        self.messages.append({"role": "assistant", "content": content})
+
+    def add_system_message(self, content: str) -> None:
+        """Add a system message.
+
+        Args:
+            content: System message content
+        """
+        self.messages.append({"role": "system", "content": content})
+
+    def get_messages(self) -> List[Dict[str, Any]]:
+        """Get all messages in standard format.
+
+        Returns:
+            List of messages
+        """
+        # If image retention is configured, apply it
+        if self.config.num_images_to_keep is not None:
+            return self._apply_image_retention(self.messages)
+        return self.messages
+
+    def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Apply image retention policy to messages.
+
+        Args:
+            messages: List of messages
+
+        Returns:
+            List of messages with image retention applied
+        """
+        if not self.config.num_images_to_keep:
+            return messages
+
+        # Find user messages with images
+        image_messages = []
+        for msg in messages:
+            if msg["role"] == "user" and isinstance(msg["content"], list):
+                has_image = any(
+                    item.get("type") == "image_url" or item.get("type") == "image"
+                    for item in msg["content"]
+                )
+                if has_image:
+                    image_messages.append(msg)
+
+        # If we don't have more images than the limit, return all messages
+        if len(image_messages) <= self.config.num_images_to_keep:
+            return messages
+
+        # Get the most recent N images to keep
+        images_to_keep = image_messages[-self.config.num_images_to_keep :]
+        images_to_remove = image_messages[: -self.config.num_images_to_keep]
+
+        # Create a new message list without the older images
+        result = []
+        for msg in messages:
+            if msg in images_to_remove:
+                # Skip this message
+                continue
+            result.append(msg)
+
+        return result
+
+    def to_anthropic_format(
+        self, messages: List[Dict[str, Any]]
+    ) -> Tuple[List[Dict[str, Any]], str]:
+        """Convert standard OpenAI format messages to Anthropic format.
+
+        Args:
+            messages: List of messages in OpenAI format
+
+        Returns:
+            Tuple containing (anthropic_messages, system_content)
+        """
+        result = []
+        system_content = ""
+
+        # Process messages in order to maintain conversation flow
+        previous_assistant_tool_use_ids = (
+            set()
+        )  # Track tool_use_ids in the previous assistant message
+
+        for i, msg in enumerate(messages):
+            role = msg.get("role", "")
+            content = msg.get("content", "")
+
+            if role == "system":
+                # Collect system messages for later use
+                system_content += content + "\n"
+                continue
+
+            if role == "assistant":
+                # Track tool_use_ids in this assistant message for the next user message
+                previous_assistant_tool_use_ids = set()
+                if isinstance(content, list):
+                    for item in content:
+                        if (
+                            isinstance(item, dict)
+                            and item.get("type") == "tool_use"
+                            and "id" in item
+                        ):
+                            previous_assistant_tool_use_ids.add(item["id"])
+
+                logger.info(
+                    f"Tool use IDs in assistant message #{i}: {previous_assistant_tool_use_ids}"
+                )
+
+            if role in ["user", "assistant"]:
+                anthropic_msg = {"role": role}
+
+                # Convert content based on type
+                if isinstance(content, str):
+                    # Simple text content
+                    anthropic_msg["content"] = [{"type": "text", "text": content}]
+                elif isinstance(content, list):
+                    # Convert complex content
+                    anthropic_content = []
+                    for item in content:
+                        item_type = item.get("type", "")
+
+                        if item_type == "text":
+                            anthropic_content.append({"type": "text", "text": item.get("text", "")})
+                        elif item_type == "image_url":
+                            # Convert OpenAI image format to Anthropic
+                            image_url = item.get("image_url", {}).get("url", "")
+                            if image_url.startswith("data:"):
+                                # Extract base64 data and media type
+                                match = re.match(r"data:(.+);base64,(.+)", image_url)
+                                if match:
+                                    media_type, data = match.groups()
+                                    anthropic_content.append(
+                                        {
+                                            "type": "image",
+                                            "source": {
+                                                "type": "base64",
+                                                "media_type": media_type,
+                                                "data": data,
+                                            },
+                                        }
+                                    )
+                            else:
+                                # Regular URL
+                                anthropic_content.append(
+                                    {
+                                        "type": "image",
+                                        "source": {
+                                            "type": "url",
+                                            "url": image_url,
+                                        },
+                                    }
+                                )
+                        elif item_type == "tool_use":
+                            # Always include tool_use blocks
+                            anthropic_content.append(item)
+                        elif item_type == "tool_result":
+                            # Check if this is a user message AND if the tool_use_id exists in the previous assistant message
+                            tool_use_id = item.get("tool_use_id")
+
+                            # Only include tool_result if it references a tool_use from the immediately preceding assistant message
+                            if (
+                                role == "user"
+                                and tool_use_id
+                                and tool_use_id in previous_assistant_tool_use_ids
+                            ):
+                                anthropic_content.append(item)
+                                logger.info(
+                                    f"Including tool_result with tool_use_id: {tool_use_id}"
+                                )
+                            else:
+                                # Convert to text to preserve information
+                                logger.warning(
+                                    f"Converting tool_result to text. Tool use ID {tool_use_id} not found in previous assistant message"
+                                )
+                                content_text = "Tool Result: "
+                                if "content" in item:
+                                    if isinstance(item["content"], list):
+                                        for content_item in item["content"]:
+                                            if (
+                                                isinstance(content_item, dict)
+                                                and content_item.get("type") == "text"
+                                            ):
+                                                content_text += content_item.get("text", "")
+                                    elif isinstance(item["content"], str):
+                                        content_text += item["content"]
+                                anthropic_content.append({"type": "text", "text": content_text})
+
+                    anthropic_msg["content"] = anthropic_content
+
+                result.append(anthropic_msg)
+
+        return result, system_content
+
+    def from_anthropic_format(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Convert Anthropic format messages to standard OpenAI format.
+
+        Args:
+            messages: List of messages in Anthropic format
+
+        Returns:
+            List of messages in OpenAI format
+        """
+        result = []
+
+        for msg in messages:
+            role = msg.get("role", "")
+            content = msg.get("content", [])
+
+            if role in ["user", "assistant"]:
+                openai_msg = {"role": role}
+
+                # Simple case: single text block
+                if len(content) == 1 and content[0].get("type") == "text":
+                    openai_msg["content"] = content[0].get("text", "")
+                else:
+                    # Complex case: multiple blocks or non-text
+                    openai_content = []
+                    for item in content:
+                        item_type = item.get("type", "")
+
+                        if item_type == "text":
+                            openai_content.append({"type": "text", "text": item.get("text", "")})
+                        elif item_type == "image":
+                            # Convert Anthropic image to OpenAI format
+                            source = item.get("source", {})
+                            if source.get("type") == "base64":
+                                media_type = source.get("media_type", "image/png")
+                                data = source.get("data", "")
+                                openai_content.append(
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": f"data:{media_type};base64,{data}"},
+                                    }
+                                )
+                            else:
+                                # URL
+                                openai_content.append(
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": source.get("url", "")},
+                                    }
+                                )
+                        elif item_type in ["tool_use", "tool_result"]:
+                            # Pass through tool-related content
+                            openai_content.append(item)
+
+                    openai_msg["content"] = openai_content
+
+                result.append(openai_msg)
+
+        return result
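Taken together, the new class replaces the deleted stand-alone `create_*_message` helpers with a single stateful manager: history is stored in OpenAI chat format and converted to or from Anthropic's block format on demand. A minimal usage sketch based on the methods above; it assumes `ImageRetentionConfig` accepts `num_images_to_keep` as a constructor argument, which the diff implies (the attribute is read in `get_messages`) but never shows:

```python
from agent.core.messages import ImageRetentionConfig, StandardMessageManager

# Assumed constructor signature; the diff only shows the attribute being read
manager = StandardMessageManager(ImageRetentionConfig(num_images_to_keep=2))
manager.add_system_message("You are a computer-use agent.")
manager.add_user_message("Open the browser.")
manager.add_assistant_message("Opening the browser now.")

# Anthropic expects system text separately from the message list
anthropic_messages, system_content = manager.to_anthropic_format(manager.get_messages())
```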
agent/core/provider_config.py ADDED
@@ -0,0 +1,15 @@
+"""Provider-specific configurations and constants."""
+
+from ..providers.omni.types import LLMProvider
+
+# Default models for different providers
+DEFAULT_MODELS = {
+    LLMProvider.OPENAI: "gpt-4o",
+    LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
+}
+
+# Map providers to their environment variable names
+ENV_VARS = {
+    LLMProvider.OPENAI: "OPENAI_API_KEY",
+    LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
+}
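These two tables give callers one place to resolve a provider's default model and the environment variable holding its API key. A minimal sketch of that lookup (the surrounding script is illustrative, not code from the package):

```python
import os

from agent.core.provider_config import DEFAULT_MODELS, ENV_VARS
from agent.providers.omni.types import LLMProvider

provider = LLMProvider.OPENAI
model = DEFAULT_MODELS[provider]  # "gpt-4o"
api_key = os.environ.get(ENV_VARS[provider])  # reads OPENAI_API_KEY
if api_key is None:
    raise RuntimeError(f"Set {ENV_VARS[provider]} before running the agent")
```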
agent/core/types.py ADDED
@@ -0,0 +1,45 @@
+"""Core type definitions."""
+
+from typing import Any, Dict, List, Optional, TypedDict, Union
+from enum import Enum, auto
+
+
+class AgentLoop(Enum):
+    """Enumeration of available loop types."""
+
+    ANTHROPIC = auto()  # Anthropic implementation
+    OMNI = auto()  # OmniLoop implementation
+    OPENAI = auto()  # OpenAI implementation
+    # Add more loop types as needed
+
+
+class AgentResponse(TypedDict, total=False):
+    """Agent response format."""
+
+    id: str
+    object: str
+    created_at: int
+    status: str
+    error: Optional[str]
+    incomplete_details: Optional[Any]
+    instructions: Optional[Any]
+    max_output_tokens: Optional[int]
+    model: str
+    output: List[Dict[str, Any]]
+    parallel_tool_calls: bool
+    previous_response_id: Optional[str]
+    reasoning: Dict[str, str]
+    store: bool
+    temperature: float
+    text: Dict[str, Dict[str, str]]
+    tool_choice: str
+    tools: List[Dict[str, Union[str, int]]]
+    top_p: float
+    truncation: str
+    usage: Dict[str, Any]
+    user: Optional[str]
+    metadata: Dict[str, Any]
+    response: Dict[str, List[Dict[str, Any]]]
+    # Additional fields for error responses
+    role: str
+    content: Union[str, List[Dict[str, Any]]]
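Because `AgentResponse` is declared with `total=False`, every field is optional and a partial dict type-checks, which is how error responses can carry only `role` and `content`. An illustrative sketch with made-up values:

```python
from agent.core.types import AgentLoop, AgentResponse

loop = AgentLoop.OPENAI  # select the OpenAI loop implementation

# total=False: only the fields we actually have need to appear
response: AgentResponse = {
    "id": "resp_123",
    "status": "completed",
    "model": "gpt-4o",
    "output": [{"type": "message", "content": "Done."}],
}
```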
agent/core/visualization.py ADDED
@@ -0,0 +1,197 @@
+"""Core visualization utilities for agents."""
+
+import logging
+import base64
+from typing import Dict, Tuple
+from PIL import Image, ImageDraw
+from io import BytesIO
+
+logger = logging.getLogger(__name__)
+
+
+def visualize_click(x: int, y: int, img_base64: str) -> Image.Image:
+    """Visualize a click action by drawing a circle on the screenshot.
+
+    Args:
+        x: X coordinate of the click
+        y: Y coordinate of the click
+        img_base64: Base64-encoded screenshot
+
+    Returns:
+        PIL Image with visualization
+    """
+    try:
+        # Decode the base64 image
+        image_data = base64.b64decode(img_base64)
+        img = Image.open(BytesIO(image_data))
+
+        # Create a copy to draw on
+        draw_img = img.copy()
+        draw = ImageDraw.Draw(draw_img)
+
+        # Draw a circle at the click location
+        radius = 15
+        draw.ellipse([(x - radius, y - radius), (x + radius, y + radius)], outline="red", width=3)
+
+        # Draw crosshairs
+        line_length = 20
+        draw.line([(x - line_length, y), (x + line_length, y)], fill="red", width=3)
+        draw.line([(x, y - line_length), (x, y + line_length)], fill="red", width=3)
+
+        return draw_img
+    except Exception as e:
+        logger.error(f"Error visualizing click: {str(e)}")
+        # Return a blank image as fallback
+        return Image.new("RGB", (800, 600), "white")
+
+
+def visualize_scroll(direction: str, clicks: int, img_base64: str) -> Image.Image:
+    """Visualize a scroll action by drawing arrows on the screenshot.
+
+    Args:
+        direction: Direction of scroll ('up' or 'down')
+        clicks: Number of scroll clicks
+        img_base64: Base64-encoded screenshot
+
+    Returns:
+        PIL Image with visualization
+    """
+    try:
+        # Decode the base64 image
+        image_data = base64.b64decode(img_base64)
+        img = Image.open(BytesIO(image_data))
+
+        # Create a copy to draw on
+        draw_img = img.copy()
+        draw = ImageDraw.Draw(draw_img)
+
+        # Calculate parameters for visualization
+        width, height = img.size
+        center_x = width // 2
+
+        # Draw arrows to indicate scrolling
+        arrow_length = min(100, height // 4)
+        arrow_width = 30
+        num_arrows = min(clicks, 3)  # Don't draw too many arrows
+
+        # Calculate starting position
+        if direction == "down":
+            start_y = height // 3
+            arrow_dir = 1  # Down
+        else:
+            start_y = height * 2 // 3
+            arrow_dir = -1  # Up
+
+        # Draw the arrows
+        for i in range(num_arrows):
+            y_pos = start_y + (i * arrow_length * arrow_dir * 0.7)
+            arrow_top = (center_x, y_pos)
+            arrow_bottom = (center_x, y_pos + arrow_length * arrow_dir)
+
+            # Draw the main line
+            draw.line([arrow_top, arrow_bottom], fill="red", width=5)
+
+            # Draw the arrowhead
+            arrowhead_size = 20
+            if direction == "down":
+                draw.line(
+                    [
+                        (center_x - arrow_width // 2, arrow_bottom[1] - arrowhead_size),
+                        arrow_bottom,
+                        (center_x + arrow_width // 2, arrow_bottom[1] - arrowhead_size),
+                    ],
+                    fill="red",
+                    width=5,
+                )
+            else:
+                draw.line(
+                    [
+                        (center_x - arrow_width // 2, arrow_bottom[1] + arrowhead_size),
+                        arrow_bottom,
+                        (center_x + arrow_width // 2, arrow_bottom[1] + arrowhead_size),
+                    ],
+                    fill="red",
+                    width=5,
+                )
+
+        return draw_img
+    except Exception as e:
+        logger.error(f"Error visualizing scroll: {str(e)}")
+        # Return a blank image as fallback
+        return Image.new("RGB", (800, 600), "white")
+
+
+def calculate_element_center(bbox: Dict[str, float], width: int, height: int) -> Tuple[int, int]:
+    """Calculate the center point of a UI element.
+
+    Args:
+        bbox: Bounding box dictionary with x1, y1, x2, y2 coordinates (0-1 normalized)
+        width: Screen width in pixels
+        height: Screen height in pixels
+
+    Returns:
+        (x, y) tuple with pixel coordinates
+    """
+    center_x = int((bbox["x1"] + bbox["x2"]) / 2 * width)
+    center_y = int((bbox["y1"] + bbox["y2"]) / 2 * height)
+    return center_x, center_y
+
+
+class VisualizationHelper:
+    """Helper class for visualizing agent actions."""
+
+    def __init__(self, agent):
+        """Initialize visualization helper.
+
+        Args:
+            agent: Reference to the agent that will use this helper
+        """
+        self.agent = agent
+
+    def visualize_action(self, x: int, y: int, img_base64: str) -> None:
+        """Visualize a click action by drawing on the screenshot."""
+        if (
+            not self.agent.save_trajectory
+            or not hasattr(self.agent, "experiment_manager")
+            or not self.agent.experiment_manager
+        ):
+            return
+
+        try:
+            # Use the visualization utility
+            img = visualize_click(x, y, img_base64)
+
+            # Save the visualization
+            self.agent.experiment_manager.save_action_visualization(img, "click", f"x{x}_y{y}")
+        except Exception as e:
+            logger.error(f"Error visualizing action: {str(e)}")
+
+    def visualize_scroll(self, direction: str, clicks: int, img_base64: str) -> None:
+        """Visualize a scroll action by drawing arrows on the screenshot."""
+        if (
+            not self.agent.save_trajectory
+            or not hasattr(self.agent, "experiment_manager")
+            or not self.agent.experiment_manager
+        ):
+            return
+
+        try:
+            # Use the visualization utility
+            img = visualize_scroll(direction, clicks, img_base64)
+
+            # Save the visualization
+            self.agent.experiment_manager.save_action_visualization(
+                img, "scroll", f"{direction}_{clicks}"
+            )
+        except Exception as e:
+            logger.error(f"Error visualizing scroll: {str(e)}")
+
+    def save_action_visualization(
+        self, img: Image.Image, action_name: str, details: str = ""
+    ) -> str:
+        """Save a visualization of an action."""
+        if hasattr(self.agent, "experiment_manager") and self.agent.experiment_manager:
+            return self.agent.experiment_manager.save_action_visualization(
+                img, action_name, details
+            )
+        return ""
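A short sketch of how these utilities compose: convert a parser's normalized bounding box to pixel coordinates, then draw the click marker on a screenshot. The bounding box values and file names below are made-up examples:

```python
import base64

from agent.core.visualization import calculate_element_center, visualize_click

# Hypothetical parsed element, normalized to [0, 1] on a 1920x1080 screen
bbox = {"x1": 0.25, "y1": 0.40, "x2": 0.35, "y2": 0.45}
x, y = calculate_element_center(bbox, width=1920, height=1080)  # (576, 459)

with open("screenshot.png", "rb") as f:  # any PNG screenshot
    img_base64 = base64.b64encode(f.read()).decode("utf-8")

annotated = visualize_click(x, y, img_base64)
annotated.save("click_visualization.png")
```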