cua-agent 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/README.md +63 -0
- agent/__init__.py +10 -0
- agent/core/README.md +101 -0
- agent/core/__init__.py +34 -0
- agent/core/agent.py +284 -0
- agent/core/base_agent.py +164 -0
- agent/core/callbacks.py +147 -0
- agent/core/computer_agent.py +69 -0
- agent/core/experiment.py +222 -0
- agent/core/factory.py +102 -0
- agent/core/loop.py +244 -0
- agent/core/messages.py +230 -0
- agent/core/tools/__init__.py +21 -0
- agent/core/tools/base.py +74 -0
- agent/core/tools/bash.py +52 -0
- agent/core/tools/collection.py +46 -0
- agent/core/tools/computer.py +113 -0
- agent/core/tools/edit.py +67 -0
- agent/core/tools/manager.py +56 -0
- agent/providers/__init__.py +4 -0
- agent/providers/anthropic/__init__.py +6 -0
- agent/providers/anthropic/api/client.py +222 -0
- agent/providers/anthropic/api/logging.py +150 -0
- agent/providers/anthropic/callbacks/manager.py +55 -0
- agent/providers/anthropic/loop.py +521 -0
- agent/providers/anthropic/messages/manager.py +110 -0
- agent/providers/anthropic/prompts.py +20 -0
- agent/providers/anthropic/tools/__init__.py +33 -0
- agent/providers/anthropic/tools/base.py +88 -0
- agent/providers/anthropic/tools/bash.py +163 -0
- agent/providers/anthropic/tools/collection.py +34 -0
- agent/providers/anthropic/tools/computer.py +550 -0
- agent/providers/anthropic/tools/edit.py +326 -0
- agent/providers/anthropic/tools/manager.py +54 -0
- agent/providers/anthropic/tools/run.py +42 -0
- agent/providers/anthropic/types.py +16 -0
- agent/providers/omni/__init__.py +27 -0
- agent/providers/omni/callbacks.py +78 -0
- agent/providers/omni/clients/anthropic.py +99 -0
- agent/providers/omni/clients/base.py +44 -0
- agent/providers/omni/clients/groq.py +101 -0
- agent/providers/omni/clients/openai.py +159 -0
- agent/providers/omni/clients/utils.py +25 -0
- agent/providers/omni/experiment.py +273 -0
- agent/providers/omni/image_utils.py +106 -0
- agent/providers/omni/loop.py +961 -0
- agent/providers/omni/messages.py +168 -0
- agent/providers/omni/parser.py +252 -0
- agent/providers/omni/prompts.py +78 -0
- agent/providers/omni/tool_manager.py +91 -0
- agent/providers/omni/tools/__init__.py +13 -0
- agent/providers/omni/tools/bash.py +69 -0
- agent/providers/omni/tools/computer.py +216 -0
- agent/providers/omni/tools/manager.py +83 -0
- agent/providers/omni/types.py +30 -0
- agent/providers/omni/utils.py +155 -0
- agent/providers/omni/visualization.py +130 -0
- agent/types/__init__.py +26 -0
- agent/types/base.py +52 -0
- agent/types/messages.py +36 -0
- agent/types/tools.py +32 -0
- cua_agent-0.1.0.dist-info/METADATA +44 -0
- cua_agent-0.1.0.dist-info/RECORD +65 -0
- cua_agent-0.1.0.dist-info/WHEEL +4 -0
- cua_agent-0.1.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
"""Anthropic-specific agent loop implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import asyncio
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, cast
|
|
8
|
+
import base64
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from httpx import ConnectError, ReadTimeout
|
|
11
|
+
|
|
12
|
+
# Anthropic-specific imports
|
|
13
|
+
from anthropic import AsyncAnthropic
|
|
14
|
+
from anthropic.types.beta import (
|
|
15
|
+
BetaMessage,
|
|
16
|
+
BetaMessageParam,
|
|
17
|
+
BetaTextBlock,
|
|
18
|
+
BetaTextBlockParam,
|
|
19
|
+
BetaToolUseBlockParam,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Computer
|
|
23
|
+
from computer import Computer
|
|
24
|
+
|
|
25
|
+
# Base imports
|
|
26
|
+
from ...core.loop import BaseLoop
|
|
27
|
+
from ...core.messages import ImageRetentionConfig
|
|
28
|
+
|
|
29
|
+
# Anthropic provider-specific imports
|
|
30
|
+
from .api.client import AnthropicClientFactory, BaseAnthropicClient
|
|
31
|
+
from .tools.manager import ToolManager
|
|
32
|
+
from .messages.manager import MessageManager
|
|
33
|
+
from .callbacks.manager import CallbackManager
|
|
34
|
+
from .prompts import SYSTEM_PROMPT
|
|
35
|
+
from .types import APIProvider
|
|
36
|
+
from .tools import ToolResult
|
|
37
|
+
|
|
38
|
+
# Constants
|
|
39
|
+
COMPUTER_USE_BETA_FLAG = "computer-use-2025-01-24"
|
|
40
|
+
PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31"
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AnthropicLoop(BaseLoop):
    """Anthropic-specific implementation of the agent loop.

    Drives a screenshot -> model -> tool-execution cycle against the
    Anthropic Messages API (computer-use beta) until the model stops
    requesting tools.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "claude-3-7-sonnet-20250219",  # Fixed model
        computer: Optional[Computer] = None,
        only_n_most_recent_images: Optional[int] = 2,
        base_dir: Optional[str] = "trajectories",
        max_retries: int = 3,
        retry_delay: float = 1.0,
        save_trajectory: bool = True,
        **kwargs,
    ):
        """Initialize the Anthropic loop.

        Args:
            api_key: Anthropic API key
            model: Model name (fixed to claude-3-7-sonnet-20250219)
            computer: Computer instance
            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
            base_dir: Base directory for saving experiment data
            max_retries: Maximum number of retries for API calls
            retry_delay: Base delay between retries in seconds (scaled linearly per attempt)
            save_trajectory: Whether to save trajectory data
        """
        # Initialize base class
        super().__init__(
            computer=computer,
            model=model,
            api_key=api_key,
            max_retries=max_retries,
            retry_delay=retry_delay,
            base_dir=base_dir,
            save_trajectory=save_trajectory,
            only_n_most_recent_images=only_n_most_recent_images,
            **kwargs,
        )

        # Ensure model is always the fixed one, regardless of what was passed in
        self.model = "claude-3-7-sonnet-20250219"

        # Anthropic-specific attributes; populated lazily by initialize_client()
        self.provider = APIProvider.ANTHROPIC
        self.client: Optional[BaseAnthropicClient] = None
        self.retry_count: int = 0
        self.tool_manager: Optional[ToolManager] = None
        self.message_manager: Optional[MessageManager] = None
        self.callback_manager: Optional[CallbackManager] = None

        # Configure image retention
        self.image_retention_config = ImageRetentionConfig(
            num_images_to_keep=only_n_most_recent_images
        )

        # Conversation state shared across turns
        self.message_history: List[Dict[str, Any]] = []

    async def initialize_client(self) -> None:
        """Initialize the Anthropic API client, managers and tools.

        Raises:
            RuntimeError: If any component fails to initialize. The original
                exception is chained as the cause.
        """
        try:
            logger.info(f"Initializing Anthropic client with model {self.model}...")

            # Initialize client
            self.client = AnthropicClientFactory.create_client(
                provider=self.provider, api_key=self.api_key, model=self.model
            )

            # Initialize message manager
            self.message_manager = MessageManager(
                ImageRetentionConfig(
                    num_images_to_keep=self.only_n_most_recent_images, enable_caching=True
                )
            )

            # Initialize callback manager
            self.callback_manager = CallbackManager(
                content_callback=self._handle_content,
                tool_callback=self._handle_tool_result,
                api_callback=self._handle_api_interaction,
            )

            # Initialize tool manager
            self.tool_manager = ToolManager(self.computer)
            await self.tool_manager.initialize()

            logger.info(f"Initialized Anthropic client with model {self.model}")
        except Exception as e:
            logger.error(f"Error initializing Anthropic client: {str(e)}")
            self.client = None
            # Chain the original exception so the root cause is preserved.
            raise RuntimeError(f"Failed to initialize Anthropic client: {str(e)}") from e

    async def _process_screen(
        self, parsed_screen: Dict[str, Any], messages: List[Dict[str, Any]]
    ) -> None:
        """Process screen information and add it to the message list.

        Args:
            parsed_screen: Dictionary containing parsed screen info
            messages: List of messages to update (mutated in place)

        Raises:
            Exception: Re-raises any error encountered while building the message.
        """
        try:
            # Extract screenshot from parsed screen
            screenshot_base64 = parsed_screen.get("screenshot_base64")

            if screenshot_base64:
                # Remove data URL prefix (e.g. "data:image/png;base64,") if present
                if "," in screenshot_base64:
                    screenshot_base64 = screenshot_base64.split(",")[1]

                # Create Anthropic-compatible message with image
                screen_info_msg = {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": screenshot_base64,
                            },
                        }
                    ],
                }

                # Add screen info message to messages
                messages.append(screen_info_msg)

        except Exception as e:
            logger.error(f"Error processing screen info: {str(e)}")
            raise

    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
        """Run the agent loop with provided messages.

        Args:
            messages: List of message objects

        Yields:
            Dict containing response data
        """
        try:
            logger.info("Starting Anthropic loop run")

            # Reset message history and add new messages
            self.message_history = []
            self.message_history.extend(messages)

            # Create queue for response streaming
            queue = asyncio.Queue()

            # Ensure client is initialized
            if self.client is None or self.tool_manager is None:
                logger.info("Initializing client...")
                await self.initialize_client()
                if self.client is None:
                    raise RuntimeError("Failed to initialize client")
                logger.info("Client initialized successfully")

            # Start loop in background task
            loop_task = asyncio.create_task(self._run_loop(queue))

            # Process and yield messages as they arrive.
            # NOTE(review): intermediate assistant content is delivered via the
            # callback manager, not this queue; the queue carries error messages
            # and the terminating None sentinel — confirm this is intended.
            while True:
                try:
                    item = await queue.get()
                    if item is None:  # Stop signal
                        break
                    yield item
                    queue.task_done()
                except Exception as e:
                    logger.error(f"Error processing queue item: {str(e)}")
                    continue

            # Wait for loop to complete
            await loop_task

            # Send completion message
            yield {
                "role": "assistant",
                "content": "Task completed successfully.",
                "metadata": {"title": "✅ Complete"},
            }

        except Exception as e:
            logger.error(f"Error executing task: {str(e)}")
            yield {
                "role": "assistant",
                "content": f"Error: {str(e)}",
                "metadata": {"title": "❌ Error"},
            }

    async def _run_loop(self, queue: asyncio.Queue) -> None:
        """Run the agent loop with current message history.

        Iterates screenshot -> API call -> tool execution until the model
        stops requesting tools, then puts a None sentinel on the queue.

        Args:
            queue: Queue for response streaming
        """
        try:
            while True:
                # Get up-to-date screen information
                parsed_screen = await self._get_parsed_screen_som()

                # Process screen info and update messages
                await self._process_screen(parsed_screen, self.message_history)

                # Prepare messages and make API call
                prepared_messages = self.message_manager.prepare_messages(
                    cast(List[BetaMessageParam], self.message_history.copy())
                )

                # Create new turn directory for this API call
                self._create_turn_dir()

                # Make API call
                response = await self._make_api_call(prepared_messages)

                # Handle the response; False means the model issued no tool calls
                if not await self._handle_response(response, self.message_history):
                    break

            # Signal completion
            await queue.put(None)

        except Exception as e:
            logger.error(f"Error in _run_loop: {str(e)}")
            await queue.put(
                {
                    "role": "assistant",
                    "content": f"Error in agent loop: {str(e)}",
                    "metadata": {"title": "❌ Error"},
                }
            )
            await queue.put(None)

    async def _make_api_call(self, messages: List[BetaMessageParam]) -> BetaMessage:
        """Make API call to Anthropic with retry logic.

        Args:
            messages: List of messages to send to the API

        Returns:
            API response

        Raises:
            RuntimeError: If all retries fail; the last error is chained as
                the cause.
        """
        last_error = None

        for attempt in range(self.max_retries):
            try:
                # Log request
                request_data = {
                    "messages": messages,
                    "max_tokens": self.max_tokens,
                    "system": SYSTEM_PROMPT,
                }
                self._log_api_call("request", request_data)

                # Setup betas and system
                system = BetaTextBlockParam(
                    type="text",
                    text=SYSTEM_PROMPT,
                )

                betas = [COMPUTER_USE_BETA_FLAG]
                # Temporarily disable prompt caching due to "A maximum of 4 blocks with cache_control may be provided" error
                # if self.message_manager.image_retention_config.enable_caching:
                #     betas.append(PROMPT_CACHING_BETA_FLAG)
                #     system["cache_control"] = {"type": "ephemeral"}

                # Make API call
                response = await self.client.create_message(
                    messages=messages,
                    system=[system],
                    tools=self.tool_manager.get_tool_params(),
                    max_tokens=self.max_tokens,
                    betas=betas,
                )

                # Log success response
                self._log_api_call("response", request_data, response)

                return response
            except Exception as e:
                last_error = e
                logger.error(
                    f"Error in API call (attempt {attempt + 1}/{self.max_retries}): {str(e)}"
                )
                self._log_api_call("error", {"messages": messages}, error=e)

                if attempt < self.max_retries - 1:
                    # Linearly increasing backoff: delay, 2*delay, 3*delay, ...
                    await asyncio.sleep(self.retry_delay * (attempt + 1))
                    continue

        # If we get here, all retries failed
        error_message = f"API call failed after {self.max_retries} attempts"
        if last_error:
            error_message += f": {str(last_error)}"

        logger.error(error_message)
        # Chain the last underlying error so callers can inspect the cause.
        raise RuntimeError(error_message) from last_error

    async def _handle_response(self, response: BetaMessage, messages: List[Dict[str, Any]]) -> bool:
        """Handle the Anthropic API response.

        Executes any tool_use blocks and appends both the assistant response
        and the tool results to the message history.

        Args:
            response: API response
            messages: List of messages to update (mutated in place)

        Returns:
            True if the loop should continue, False otherwise
        """
        try:
            # Convert response to parameter format
            response_params = self._response_to_params(response)

            # Add response to messages
            messages.append(
                {
                    "role": "assistant",
                    "content": response_params,
                }
            )

            # Handle tool use blocks and collect results
            tool_result_content = []
            for content_block in response_params:
                # Notify callback of content
                self.callback_manager.on_content(content_block)

                # Handle tool use
                if content_block.get("type") == "tool_use":
                    result = await self.tool_manager.execute_tool(
                        name=content_block["name"],
                        tool_input=cast(Dict[str, Any], content_block["input"]),
                    )

                    # Create tool result and add to content
                    tool_result = self._make_tool_result(result, content_block["id"])
                    tool_result_content.append(tool_result)

                    # Notify callback of tool result
                    self.callback_manager.on_tool_result(result, content_block["id"])

            # If no tool results, we're done
            if not tool_result_content:
                # Signal completion
                self.callback_manager.on_content({"type": "text", "text": "<DONE>"})
                return False

            # Add tool results to message history
            messages.append({"content": tool_result_content, "role": "user"})
            return True

        except Exception as e:
            logger.error(f"Error handling response: {str(e)}")
            messages.append(
                {
                    "role": "assistant",
                    "content": f"Error: {str(e)}",
                }
            )
            return False

    def _response_to_params(
        self,
        response: BetaMessage,
    ) -> List[Dict[str, Any]]:
        """Convert API response to message parameters.

        Args:
            response: API response message

        Returns:
            List of content blocks as plain dicts
        """
        result = []
        for block in response.content:
            if isinstance(block, BetaTextBlock):
                result.append({"type": "text", "text": block.text})
            else:
                # Non-text blocks (e.g. tool_use) are serialized as-is
                result.append(cast(Dict[str, Any], block.model_dump()))
        return result

    def _make_tool_result(self, result: ToolResult, tool_use_id: str) -> Dict[str, Any]:
        """Convert a tool result to API format.

        Args:
            result: Tool execution result
            tool_use_id: ID of the tool use

        Returns:
            Formatted tool result
        """
        # If the tool supplied pre-formatted content, pass it through directly.
        if result.content:
            return {
                "type": "tool_result",
                "content": result.content,
                "tool_use_id": tool_use_id,
                "is_error": bool(result.error),
            }

        tool_result_content = []
        is_error = False

        if result.error:
            is_error = True
            tool_result_content = [
                {
                    "type": "text",
                    "text": self._maybe_prepend_system_tool_result(result, result.error),
                }
            ]
        else:
            if result.output:
                tool_result_content.append(
                    {
                        "type": "text",
                        "text": self._maybe_prepend_system_tool_result(result, result.output),
                    }
                )
            if result.base64_image:
                tool_result_content.append(
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": result.base64_image,
                        },
                    }
                )

        return {
            "type": "tool_result",
            "content": tool_result_content,
            "tool_use_id": tool_use_id,
            "is_error": is_error,
        }

    def _maybe_prepend_system_tool_result(self, result: ToolResult, result_text: str) -> str:
        """Prepend system information to tool result if available.

        Args:
            result: Tool execution result
            result_text: Text to prepend to

        Returns:
            Text with system information prepended if available
        """
        if result.system:
            result_text = f"<s>{result.system}</s>\n{result_text}"
        return result_text

    def _handle_content(self, content: Dict[str, Any]) -> None:
        """Handle content updates from the assistant (logging only)."""
        if content.get("type") == "text":
            text = content.get("text", "")
            # "<DONE>" is an internal completion sentinel — never log it.
            if text == "<DONE>":
                return

            logger.info(f"Assistant: {text}")

    def _handle_tool_result(self, result: ToolResult, tool_id: str) -> None:
        """Handle tool execution results (logging only)."""
        if result.error:
            logger.error(f"Tool {tool_id} error: {result.error}")
        else:
            logger.info(f"Tool {tool_id} output: {result.output}")

    def _handle_api_interaction(
        self, request: Any, response: Any, error: Optional[Exception]
    ) -> None:
        """Handle API interactions (logging only)."""
        if error:
            logger.error(f"API error: {error}")
        else:
            logger.debug(f"API request: {request}")
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import cast
|
|
3
|
+
from anthropic.types.beta import (
|
|
4
|
+
BetaMessageParam,
|
|
5
|
+
BetaCacheControlEphemeralParam,
|
|
6
|
+
BetaToolResultBlockParam,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
|
|
11
|
+
class ImageRetentionConfig:
|
|
12
|
+
"""Configuration for image retention in messages."""
|
|
13
|
+
|
|
14
|
+
num_images_to_keep: int | None = None
|
|
15
|
+
min_removal_threshold: int = 1
|
|
16
|
+
enable_caching: bool = True
|
|
17
|
+
|
|
18
|
+
def should_retain_images(self) -> bool:
|
|
19
|
+
"""Check if image retention is enabled."""
|
|
20
|
+
return self.num_images_to_keep is not None and self.num_images_to_keep > 0
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MessageManager:
    """Manages message preparation, including image retention and caching.

    Both transformations mutate the supplied message list in place; callers
    should pass a copy if they need to keep the original untouched.
    """

    def __init__(self, image_retention_config: ImageRetentionConfig):
        """Initialize the message manager.

        Args:
            image_retention_config: Configuration for image retention

        Raises:
            ValueError: If min_removal_threshold is less than 1.
        """
        if image_retention_config.min_removal_threshold < 1:
            raise ValueError("min_removal_threshold must be at least 1")
        self.image_retention_config = image_retention_config

    def prepare_messages(self, messages: list[BetaMessageParam]) -> list[BetaMessageParam]:
        """Prepare messages by applying image retention and caching as configured.

        Returns the same list object after in-place modification.
        """
        if self.image_retention_config.should_retain_images():
            self._filter_images(messages)
        if self.image_retention_config.enable_caching:
            self._inject_caching(messages)
        return messages

    def _filter_images(self, messages: list[BetaMessageParam]) -> None:
        """Filter messages to retain only the specified number of most recent images."""
        # Gather every tool_result block across all messages, oldest first,
        # so removal below naturally drops the oldest images.
        tool_result_blocks = cast(
            list[BetaToolResultBlockParam],
            [
                item
                for message in messages
                for item in (message["content"] if isinstance(message["content"], list) else [])
                if isinstance(item, dict) and item.get("type") == "tool_result"
            ],
        )

        # Total count of embedded images (screenshots) across all tool results.
        total_images = sum(
            1
            for tool_result in tool_result_blocks
            for content in tool_result.get("content", [])
            if isinstance(content, dict) and content.get("type") == "image"
        )

        images_to_remove = total_images - (self.image_retention_config.num_images_to_keep or 0)
        # Round down to nearest min_removal_threshold for better cache behavior
        images_to_remove -= images_to_remove % self.image_retention_config.min_removal_threshold

        # Remove oldest images first; non-image content is always preserved.
        for tool_result in tool_result_blocks:
            if isinstance(tool_result.get("content"), list):
                new_content = []
                for content in tool_result.get("content", []):
                    if isinstance(content, dict) and content.get("type") == "image":
                        if images_to_remove > 0:
                            images_to_remove -= 1
                            continue
                    new_content.append(content)
                tool_result["content"] = new_content

    def _inject_caching(self, messages: list[BetaMessageParam]) -> None:
        """Inject caching control for the most recent turns, limited to 3 blocks max to avoid API errors."""
        # Anthropic API allows a maximum of 4 blocks with cache_control
        # We use 3 here to be safe, as the system block may also have cache_control
        blocks_with_cache_control = 0
        max_cache_control_blocks = 3

        # Walk newest-to-oldest: the most recent user turns receive
        # cache_control, older ones have any stale markers stripped.
        for message in reversed(messages):
            if message["role"] == "user" and isinstance(content := message["content"], list):
                # Only add cache control to the latest message in each turn
                if blocks_with_cache_control < max_cache_control_blocks:
                    blocks_with_cache_control += 1
                    # Add cache control to the last content block only
                    if content and len(content) > 0:
                        content[-1]["cache_control"] = {"type": "ephemeral"}
                else:
                    # Remove any existing cache control
                    if content and len(content) > 0:
                        content[-1].pop("cache_control", None)

        # Ensure we're not exceeding the limit by checking the total
        # NOTE(review): defensive only — the counter above is only incremented
        # while below the limit, so this cleanup pass should be unreachable.
        if blocks_with_cache_control > max_cache_control_blocks:
            # If we somehow exceeded the limit, remove excess cache controls
            excess = blocks_with_cache_control - max_cache_control_blocks
            for message in messages:
                if excess <= 0:
                    break

                if message["role"] == "user" and isinstance(content := message["content"], list):
                    if content and len(content) > 0 and "cache_control" in content[-1]:
                        content[-1].pop("cache_control", None)
                        excess -= 1
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""System prompts for Anthropic provider."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import platform
|
|
5
|
+
|
|
6
|
+
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
|
|
7
|
+
* You are utilising a macOS virtual machine using ARM architecture with internet access and Safari as default browser.
|
|
8
|
+
* You can feel free to install macOS applications with your bash tool. Use curl instead of wget.
|
|
9
|
+
* Using bash tool you can start GUI applications. GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did.
|
|
10
|
+
* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B <lines before> -A <lines after> <query> <filename>` to confirm output.
|
|
11
|
+
* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
|
|
12
|
+
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
|
|
13
|
+
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
|
14
|
+
</SYSTEM_CAPABILITY>
|
|
15
|
+
|
|
16
|
+
<IMPORTANT>
|
|
17
|
+
* Plan at maximum 1 step each time, and evaluate the result of each step before proceeding. Hold back if you're not sure about the result of the step.
|
|
18
|
+
* If you're not sure about the location of an application, use start the app using the bash tool.
|
|
19
|
+
* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool.
|
|
20
|
+
</IMPORTANT>"""
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Anthropic-specific tools for agent."""
|
|
2
|
+
|
|
3
|
+
from .base import (
|
|
4
|
+
BaseAnthropicTool,
|
|
5
|
+
ToolResult,
|
|
6
|
+
ToolError,
|
|
7
|
+
ToolFailure,
|
|
8
|
+
CLIResult,
|
|
9
|
+
AnthropicToolResult,
|
|
10
|
+
AnthropicToolError,
|
|
11
|
+
AnthropicToolFailure,
|
|
12
|
+
AnthropicCLIResult,
|
|
13
|
+
)
|
|
14
|
+
from .bash import BashTool
|
|
15
|
+
from .computer import ComputerTool
|
|
16
|
+
from .edit import EditTool
|
|
17
|
+
from .manager import ToolManager
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"BaseAnthropicTool",
|
|
21
|
+
"ToolResult",
|
|
22
|
+
"ToolError",
|
|
23
|
+
"ToolFailure",
|
|
24
|
+
"CLIResult",
|
|
25
|
+
"AnthropicToolResult",
|
|
26
|
+
"AnthropicToolError",
|
|
27
|
+
"AnthropicToolFailure",
|
|
28
|
+
"AnthropicCLIResult",
|
|
29
|
+
"BashTool",
|
|
30
|
+
"ComputerTool",
|
|
31
|
+
"EditTool",
|
|
32
|
+
"ToolManager",
|
|
33
|
+
]
|