PyPI - cua-agent - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (52) hide show

agent/__init__.py +3 -4
agent/core/__init__.py +3 -10
agent/core/computer_agent.py +207 -32
agent/core/experiment.py +20 -3
agent/core/loop.py +78 -120
agent/core/messages.py +279 -125
agent/core/telemetry.py +44 -32
agent/core/types.py +35 -0
agent/core/visualization.py +197 -0
agent/providers/anthropic/api/client.py +142 -1
agent/providers/anthropic/api_handler.py +140 -0
agent/providers/anthropic/callbacks/__init__.py +5 -0
agent/providers/anthropic/loop.py +224 -209
agent/providers/anthropic/messages/manager.py +3 -1
agent/providers/anthropic/response_handler.py +229 -0
agent/providers/anthropic/tools/base.py +1 -1
agent/providers/anthropic/tools/bash.py +0 -97
agent/providers/anthropic/tools/collection.py +2 -2
agent/providers/anthropic/tools/computer.py +34 -24
agent/providers/anthropic/tools/manager.py +2 -2
agent/providers/anthropic/utils.py +370 -0
agent/providers/omni/__init__.py +1 -20
agent/providers/omni/api_handler.py +42 -0
agent/providers/omni/clients/anthropic.py +4 -0
agent/providers/omni/image_utils.py +0 -72
agent/providers/omni/loop.py +497 -607
agent/providers/omni/parser.py +60 -5
agent/providers/omni/tools/__init__.py +25 -8
agent/providers/omni/tools/base.py +29 -0
agent/providers/omni/tools/bash.py +43 -38
agent/providers/omni/tools/computer.py +144 -181
agent/providers/omni/tools/manager.py +26 -48
agent/providers/omni/types.py +0 -4
agent/providers/omni/utils.py +225 -144
{cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
cua_agent-0.1.17.dist-info/RECORD +63 -0
agent/core/agent.py +0 -252
agent/core/base_agent.py +0 -164
agent/core/factory.py +0 -102
agent/providers/omni/callbacks.py +0 -78
agent/providers/omni/clients/groq.py +0 -101
agent/providers/omni/experiment.py +0 -273
agent/providers/omni/messages.py +0 -171
agent/providers/omni/tool_manager.py +0 -91
agent/providers/omni/visualization.py +0 -130
agent/types/__init__.py +0 -26
agent/types/base.py +0 -53
agent/types/messages.py +0 -36
cua_agent-0.1.5.dist-info/RECORD +0 -67
/agent/{types → core}/tools.py +0 -0
{cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
{cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0

agent/core/visualization.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""Core visualization utilities for agents."""
+import logging
+import base64
+from typing import Dict, Tuple
+from PIL import Image, ImageDraw
+from io import BytesIO
+logger = logging.getLogger(__name__)
+def visualize_click(x: int, y: int, img_base64: str) -> Image.Image:
+    """Mark a click position on a screenshot with a red circle and crosshair.
+    Args:
+        x: Horizontal pixel coordinate of the click
+        y: Vertical pixel coordinate of the click
+        img_base64: Screenshot encoded as a base64 string
+    Returns:
+        Annotated copy of the screenshot, or a blank white 800x600 image
+        when decoding or drawing raises
+    """
+    marker_radius = 15
+    crosshair_reach = 20
+    try:
+        screenshot = Image.open(BytesIO(base64.b64decode(img_base64)))
+        # Annotate a copy so the decoded image itself is left untouched
+        annotated = screenshot.copy()
+        pen = ImageDraw.Draw(annotated)
+        # Circle centred on the click
+        top_left = (x - marker_radius, y - marker_radius)
+        bottom_right = (x + marker_radius, y + marker_radius)
+        pen.ellipse([top_left, bottom_right], outline="red", width=3)
+        # Horizontal stroke, then vertical stroke, of the crosshair
+        pen.line([(x - crosshair_reach, y), (x + crosshair_reach, y)], fill="red", width=3)
+        pen.line([(x, y - crosshair_reach), (x, y + crosshair_reach)], fill="red", width=3)
+        return annotated
+    except Exception as e:
+        logger.error(f"Error visualizing click: {str(e)}")
+        # Best-effort: hand back a blank canvas instead of propagating
+        return Image.new("RGB", (800, 600), "white")
+def visualize_scroll(direction: str, clicks: int, img_base64: str) -> Image.Image:
+    """Overlay red arrows on a screenshot to depict a scroll action.
+    Args:
+        direction: Scroll direction; "down" draws downward arrows, any other
+            value is drawn as an upward scroll
+        clicks: Number of scroll clicks (at most three arrows are drawn)
+        img_base64: Screenshot encoded as a base64 string
+    Returns:
+        Annotated copy of the screenshot, or a blank white 800x600 image
+        when decoding or drawing raises
+    """
+    try:
+        screenshot = Image.open(BytesIO(base64.b64decode(img_base64)))
+        # Annotate a copy so the decoded image itself is left untouched
+        annotated = screenshot.copy()
+        pen = ImageDraw.Draw(annotated)
+        width, height = screenshot.size
+        mid_x = width // 2
+        shaft_length = min(100, height // 4)
+        half_head_width = 30 // 2
+        head_size = 20
+        arrow_count = min(clicks, 3)  # Cap the number of arrows drawn
+        scrolling_down = direction == "down"
+        # sign is +1 when arrows grow downwards and -1 when they grow upwards
+        sign = 1 if scrolling_down else -1
+        first_y = height // 3 if scrolling_down else height * 2 // 3
+        for index in range(arrow_count):
+            tail_y = first_y + (index * shaft_length * sign * 0.7)
+            tip = (mid_x, tail_y + shaft_length * sign)
+            # Shaft of the arrow
+            pen.line([(mid_x, tail_y), tip], fill="red", width=5)
+            # Arrowhead: the two wings sit head_size behind the tip
+            wing_y = tip[1] - head_size * sign
+            pen.line(
+                [
+                    (mid_x - half_head_width, wing_y),
+                    tip,
+                    (mid_x + half_head_width, wing_y),
+                ],
+                fill="red",
+                width=5,
+            )
+        return annotated
+    except Exception as e:
+        logger.error(f"Error visualizing scroll: {str(e)}")
+        # Best-effort: hand back a blank canvas instead of propagating
+        return Image.new("RGB", (800, 600), "white")
+def calculate_element_center(bbox: Dict[str, float], width: int, height: int) -> Tuple[int, int]:
+    """Convert a normalized bounding box into the pixel position of its center.
+    Args:
+        bbox: Mapping with "x1", "y1", "x2", "y2" entries (0-1 normalized)
+        width: Screen width in pixels
+        height: Screen height in pixels
+    Returns:
+        (x, y) pixel coordinates of the center, truncated to integers
+    """
+    # Midpoint along each axis, still in normalized units
+    mid_x = (bbox["x1"] + bbox["x2"]) / 2
+    mid_y = (bbox["y1"] + bbox["y2"]) / 2
+    return int(mid_x * width), int(mid_y * height)
+class VisualizationHelper:
+    """Draws agent actions onto screenshots and stores them via the agent's experiment manager."""
+    def __init__(self, agent):
+        """Keep a reference to the owning agent.
+        Args:
+            agent: Agent whose save_trajectory flag and experiment_manager are consulted
+        """
+        self.agent = agent
+    def _can_record(self) -> bool:
+        """Return True when trajectory saving is on and an experiment manager is set."""
+        agent = self.agent
+        return bool(
+            agent.save_trajectory
+            and hasattr(agent, "experiment_manager")
+            and agent.experiment_manager
+        )
+    def visualize_action(self, x: int, y: int, img_base64: str) -> None:
+        """Render a click marker at (x, y) and save it; errors are logged, not raised."""
+        if not self._can_record():
+            return
+        try:
+            annotated = visualize_click(x, y, img_base64)
+            self.agent.experiment_manager.save_action_visualization(
+                annotated, "click", f"x{x}_y{y}"
+            )
+        except Exception as e:
+            logger.error(f"Error visualizing action: {str(e)}")
+    def visualize_scroll(self, direction: str, clicks: int, img_base64: str) -> None:
+        """Render scroll arrows and save the result; errors are logged, not raised."""
+        if not self._can_record():
+            return
+        try:
+            # Resolves to the module-level visualize_scroll function
+            annotated = visualize_scroll(direction, clicks, img_base64)
+            self.agent.experiment_manager.save_action_visualization(
+                annotated, "scroll", f"{direction}_{clicks}"
+            )
+        except Exception as e:
+            logger.error(f"Error visualizing scroll: {str(e)}")
+    def save_action_visualization(
+        self, img: Image.Image, action_name: str, details: str = ""
+    ) -> str:
+        """Forward an image to the experiment manager.
+        Returns:
+            Whatever the manager's save_action_visualization returns, or ""
+            when the agent has no experiment manager
+        """
+        manager = getattr(self.agent, "experiment_manager", None)
+        if not manager:
+            return ""
+        return manager.save_action_visualization(img, action_name, details)

agent/providers/anthropic/api/client.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, List, Dict, cast
 import httpx
 import asyncio
 from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex
@@ -80,6 +80,147 @@ class BaseAnthropicClient:
             f"Failed after {self.MAX_RETRIES} retries. " f"Last error: {str(last_error)}"
         )
+    async def run_interleaved(
+        self, messages: List[Dict[str, Any]], system: str, max_tokens: int = 4096
+    ) -> Any:
+        """Run the Anthropic API with the Claude model, supports interleaved tool calling.
+        Args:
+            messages: List of message objects
+            system: System prompt
+            max_tokens: Maximum tokens to generate
+        Returns:
+            API response
+        Raises:
+            RuntimeError: If every attempt fails; chained to the last error seen
+        """
+        # Insert placeholder tool_result blocks for tool_use blocks that have none
+        fixed_messages = self._fix_missing_tool_results(messages)
+        # Get model name from concrete implementation if available
+        model_name = getattr(self, "model", "unknown model")
+        logger.info(f"Running Anthropic API call with model {model_name}")
+        last_error = None
+        for attempt in range(1, self.MAX_RETRIES + 1):
+            try:
+                # create_message is implemented by subclasses.
+                # NOTE(review): the system prompt is passed as a one-element list of
+                # str; confirm create_message accepts a bare str item here.
+                response = await self.create_message(
+                    messages=cast(list[BetaMessageParam], fixed_messages),
+                    system=[system],
+                    tools=[],  # Tools are included in the messages
+                    max_tokens=max_tokens,
+                    betas=["tools-2023-12-13"],
+                )
+                logger.info("Anthropic API call successful")
+                return response
+            except Exception as e:
+                last_error = e
+                if attempt >= self.MAX_RETRIES:
+                    # No retry follows, so do not sleep or announce one
+                    logger.error(
+                        f"Anthropic API call failed on final attempt {attempt}/{self.MAX_RETRIES}: {str(e)}"
+                    )
+                    break
+                # Exponential backoff: delay doubles after each failed attempt
+                wait_time = self.INITIAL_RETRY_DELAY * (2 ** (attempt - 1))
+                logger.info(
+                    f"Retrying request (attempt {attempt}/{self.MAX_RETRIES}) in {wait_time:.2f} seconds after error: {str(e)}"
+                )
+                await asyncio.sleep(wait_time)
+        # If we get here, all retries failed
+        error_message = f"Failed to call Anthropic API after {self.MAX_RETRIES} attempts"
+        if last_error is not None:
+            error_message += f". Last error: {str(last_error)}"
+        raise RuntimeError(error_message) from last_error
+    def _fix_missing_tool_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Check for and fix any missing tool_result blocks after tool_use blocks.
+        A synthetic user message carrying error tool_results is inserted when an
+        assistant message is directly followed by another assistant message while
+        tool_use IDs are still unanswered, and appended when the conversation
+        ends on an assistant message with unanswered tool_use IDs.
+        Args:
+            messages: List of message objects
+        Returns:
+            New list with the original message objects plus any synthetic
+            user messages; the input list itself is not modified
+        """
+        def blocks_of_type(message, block_type):
+            # Only list/tuple content can hold blocks; str or None content has none
+            content = message.get("content")
+            if not isinstance(content, (list, tuple)):
+                return []
+            return [
+                block
+                for block in content
+                if isinstance(block, dict) and block.get("type") == block_type
+            ]
+        def synthetic_results(tool_ids):
+            # tool_result content must be a string or content blocks; the failure
+            # is signalled through is_error rather than a custom "error" object
+            return {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "tool_result",
+                        "tool_use_id": tool_id,
+                        "content": "Tool execution was skipped or failed",
+                        "is_error": True,
+                    }
+                    for tool_id in tool_ids
+                ],
+            }
+        fixed_messages = []
+        pending_tool_uses = {}  # Unanswered tool_use IDs, in insertion order
+        for i, message in enumerate(messages):
+            role = message.get("role")
+            if role == "assistant":
+                # Track any tool_use blocks in this message
+                for block in blocks_of_type(message, "tool_use"):
+                    tool_id = block.get("id")
+                    if tool_id:
+                        pending_tool_uses[tool_id] = None
+            elif role == "user":
+                # A tool_result resolves the tool_use it references
+                for block in blocks_of_type(message, "tool_result"):
+                    pending_tool_uses.pop(block.get("tool_use_id"), None)
+            fixed_messages.append(message)
+            next_is_assistant = (
+                i + 1 < len(messages) and messages[i + 1].get("role") == "assistant"
+            )
+            if role == "assistant" and next_is_assistant and pending_tool_uses:
+                # Resolve everything outstanding before the next assistant turn
+                fixed_messages.append(synthetic_results(pending_tool_uses))
+                pending_tool_uses = {}
+        # Conversation ends on an assistant turn with unanswered tool_use blocks
+        if pending_tool_uses and fixed_messages and fixed_messages[-1].get("role") == "assistant":
+            fixed_messages.append(synthetic_results(pending_tool_uses))
+        return fixed_messages
 class AnthropicDirectClient(BaseAnthropicClient):
     """Direct Anthropic API client implementation."""

agent/providers/anthropic/api_handler.py ADDED Viewed

@@ -0,0 +1,140 @@
+"""API call handling for Anthropic provider."""
+import logging
+import asyncio
+from typing import List
+from anthropic.types.beta import (
+    BetaMessage,
+    BetaMessageParam,
+    BetaTextBlockParam,
+)
+from .types import LLMProvider
+from .prompts import SYSTEM_PROMPT
+# Constants
+# Beta feature identifiers sent in the `betas` list of create_message.
+# Always included in the request.
+COMPUTER_USE_BETA_FLAG = "computer-use-2025-01-24"
+# Added only when the message manager's config has enable_caching set.
+PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31"
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+class AnthropicAPIHandler:
+    """Handles API calls to Anthropic's API with structured error handling and retries."""
+    def __init__(self, loop):
+        """Initialize the API handler.
+        Args:
+            loop: Reference to the parent loop instance that provides context
+                (client, tool_manager, message_manager, max_tokens, max_retries,
+                retry_delay and _log_api_call are read from it)
+        """
+        self.loop = loop
+    @staticmethod
+    def _log_tool_id_summary(messages: List[BetaMessageParam]) -> None:
+        """Log tool_use / tool_result IDs and warn about results with no matching use."""
+        tool_use_ids = set()
+        tool_result_ids = set()
+        for i, msg in enumerate(messages):
+            logger.debug("Message %d: role=%s", i, msg.get("role"))
+            content = msg.get("content")
+            if not isinstance(content, list):
+                continue
+            for content_block in content:
+                if not isinstance(content_block, dict):
+                    continue
+                block_type = content_block.get("type")
+                if block_type == "tool_use" and "id" in content_block:
+                    tool_id = content_block.get("id")
+                    tool_use_ids.add(tool_id)
+                    logger.debug("  - Found tool_use with ID: %s", tool_id)
+                elif block_type == "tool_result" and "tool_use_id" in content_block:
+                    result_id = content_block.get("tool_use_id")
+                    tool_result_ids.add(result_id)
+                    logger.debug("  - Found tool_result referencing ID: %s", result_id)
+        # A tool_result that references an unknown tool_use is worth a warning
+        missing_tool_uses = tool_result_ids - tool_use_ids
+        if missing_tool_uses:
+            logger.warning(
+                f"Found tool_result IDs without matching tool_use IDs: {missing_tool_uses}"
+            )
+    async def make_api_call(
+        self, messages: List[BetaMessageParam], system_prompt: str = SYSTEM_PROMPT
+    ) -> BetaMessage:
+        """Make API call to Anthropic with retry logic.
+        Args:
+            messages: List of messages to send to the API
+            system_prompt: System prompt to use (default: SYSTEM_PROMPT)
+        Returns:
+            API response
+        Raises:
+            RuntimeError: If the client or tool manager is not initialized, or
+                if the API call fails after all retries (chained to the last error)
+        """
+        if self.loop.client is None:
+            raise RuntimeError("Client not initialized. Call initialize_client() first.")
+        if self.loop.tool_manager is None:
+            raise RuntimeError("Tool manager not initialized. Call initialize_client() first.")
+        last_error = None
+        logger.info(f"Sending {len(messages)} messages to Anthropic API")
+        # Per-message diagnostics are emitted at DEBUG level to keep INFO logs quiet
+        self._log_tool_id_summary(messages)
+        for attempt in range(self.loop.max_retries):
+            try:
+                # Log request
+                request_data = {
+                    "messages": messages,
+                    "max_tokens": self.loop.max_tokens,
+                    "system": system_prompt,
+                }
+                # Let ExperimentManager handle sanitization
+                self.loop._log_api_call("request", request_data)
+                # Setup betas and system
+                system = BetaTextBlockParam(
+                    type="text",
+                    text=system_prompt,
+                )
+                betas = [COMPUTER_USE_BETA_FLAG]
+                # Add prompt caching if enabled in the message manager's config
+                if self.loop.message_manager.config.enable_caching:
+                    betas.append(PROMPT_CACHING_BETA_FLAG)
+                    system["cache_control"] = {"type": "ephemeral"}
+                # Make API call
+                response = await self.loop.client.create_message(
+                    messages=messages,
+                    system=[system],
+                    tools=self.loop.tool_manager.get_tool_params(),
+                    max_tokens=self.loop.max_tokens,
+                    betas=betas,
+                )
+                # Let ExperimentManager handle sanitization
+                self.loop._log_api_call("response", request_data, response)
+                return response
+            except Exception as e:
+                last_error = e
+                logger.error(
+                    f"Error in API call (attempt {attempt + 1}/{self.loop.max_retries}): {str(e)}"
+                )
+                self.loop._log_api_call("error", {"messages": messages}, error=e)
+                # Sleep only when another attempt follows
+                if attempt < self.loop.max_retries - 1:
+                    # Exponential backoff: retry_delay, 2x, 4x, ...
+                    await asyncio.sleep(self.loop.retry_delay * (2 ** attempt))
+        # If we get here, all retries failed
+        error_message = f"API call failed after {self.loop.max_retries} attempts"
+        if last_error:
+            error_message += f": {str(last_error)}"
+        logger.error(error_message)
+        raise RuntimeError(error_message) from last_error

agent/providers/anthropic/callbacks/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Anthropic callbacks package."""
+from .manager import CallbackManager
+__all__ = ["CallbackManager"]

cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl