PyPI - cua-agent - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

cua-agent 0.1.6py3-none-any.whl → 0.1.17py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (42) hide show

agent/__init__.py +3 -2
agent/core/__init__.py +0 -5
agent/core/computer_agent.py +21 -28
agent/core/loop.py +78 -124
agent/core/messages.py +279 -125
agent/core/types.py +35 -0
agent/core/visualization.py +197 -0
agent/providers/anthropic/api/client.py +142 -1
agent/providers/anthropic/api_handler.py +140 -0
agent/providers/anthropic/callbacks/__init__.py +5 -0
agent/providers/anthropic/loop.py +206 -220
agent/providers/anthropic/response_handler.py +229 -0
agent/providers/anthropic/tools/bash.py +0 -97
agent/providers/anthropic/utils.py +370 -0
agent/providers/omni/__init__.py +1 -20
agent/providers/omni/api_handler.py +42 -0
agent/providers/omni/clients/anthropic.py +4 -0
agent/providers/omni/image_utils.py +0 -72
agent/providers/omni/loop.py +490 -606
agent/providers/omni/parser.py +58 -4
agent/providers/omni/tools/__init__.py +25 -7
agent/providers/omni/tools/base.py +29 -0
agent/providers/omni/tools/bash.py +43 -38
agent/providers/omni/tools/computer.py +144 -182
agent/providers/omni/tools/manager.py +25 -45
agent/providers/omni/types.py +0 -4
agent/providers/omni/utils.py +224 -145
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
cua_agent-0.1.17.dist-info/RECORD +63 -0
agent/providers/omni/callbacks.py +0 -78
agent/providers/omni/clients/groq.py +0 -101
agent/providers/omni/experiment.py +0 -276
agent/providers/omni/messages.py +0 -171
agent/providers/omni/tool_manager.py +0 -91
agent/providers/omni/visualization.py +0 -130
agent/types/__init__.py +0 -23
agent/types/base.py +0 -41
agent/types/messages.py +0 -36
cua_agent-0.1.6.dist-info/RECORD +0 -64
/agent/{types → core}/tools.py +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0

agent/providers/anthropic/loop.py CHANGED Viewed

@@ -2,39 +2,36 @@
 import logging
 import asyncio
-import json
-import os
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, cast
-import base64
-from datetime import datetime
-from httpx import ConnectError, ReadTimeout
-# Anthropic-specific imports
-from anthropic import AsyncAnthropic
 from anthropic.types.beta import (
     BetaMessage,
     BetaMessageParam,
     BetaTextBlock,
-    BetaTextBlockParam,
-    BetaToolUseBlockParam,
     BetaContentBlockParam,
 )
+import base64
+from datetime import datetime
 # Computer
 from computer import Computer
 # Base imports
 from ...core.loop import BaseLoop
-from ...core.messages import ImageRetentionConfig as CoreImageRetentionConfig
+from ...core.messages import StandardMessageManager, ImageRetentionConfig
+from ...core.types import AgentResponse
 # Anthropic provider-specific imports
 from .api.client import AnthropicClientFactory, BaseAnthropicClient
 from .tools.manager import ToolManager
-from .messages.manager import MessageManager, ImageRetentionConfig
-from .callbacks.manager import CallbackManager
 from .prompts import SYSTEM_PROMPT
 from .types import LLMProvider
 from .tools import ToolResult
+from .utils import to_anthropic_format, to_agent_response_format
+# Import the new modules we created
+from .api_handler import AnthropicAPIHandler
+from .response_handler import AnthropicResponseHandler
+from .callbacks.manager import CallbackManager
 # Constants
 COMPUTER_USE_BETA_FLAG = "computer-use-2025-01-24"
@@ -44,13 +41,22 @@ logger = logging.getLogger(__name__)
 class AnthropicLoop(BaseLoop):
-    """Anthropic-specific implementation of the agent loop."""
+    """Anthropic-specific implementation of the agent loop.
+    This class extends BaseLoop to provide specialized support for Anthropic's Claude models
+    with their unique tool-use capabilities, custom message formatting, and
+    callback-driven approach to handling responses.
+    """
+    ###########################################
+    # INITIALIZATION AND CONFIGURATION
+    ###########################################
     def __init__(
         self,
         api_key: str,
         computer: Computer,
-        model: str = "claude-3-7-sonnet-20250219",  # Fixed model
+        model: str = "claude-3-7-sonnet-20250219",
         only_n_most_recent_images: Optional[int] = 2,
         base_dir: Optional[str] = "trajectories",
         max_retries: int = 3,
@@ -83,27 +89,33 @@ class AnthropicLoop(BaseLoop):
             **kwargs,
         )
-        # Ensure model is always the fixed one
-        self.model = "claude-3-7-sonnet-20250219"
+        # Initialize message manager
+        self.message_manager = StandardMessageManager(
+            config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
+        )
         # Anthropic-specific attributes
         self.provider = LLMProvider.ANTHROPIC
         self.client = None
         self.retry_count = 0
         self.tool_manager = None
-        self.message_manager = None
         self.callback_manager = None
+        self.queue = asyncio.Queue()  # Initialize queue
-        # Configure image retention with core config
-        self.image_retention_config = CoreImageRetentionConfig(
-            num_images_to_keep=only_n_most_recent_images
-        )
+        # Initialize handlers
+        self.api_handler = AnthropicAPIHandler(self)
+        self.response_handler = AnthropicResponseHandler(self)
-        # Message history
-        self.message_history = []
+    ###########################################
+    # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
     async def initialize_client(self) -> None:
-        """Initialize the Anthropic API client and tools."""
+        """Initialize the Anthropic API client and tools.
+        Implements abstract method from BaseLoop to set up the Anthropic-specific
+        client, tool manager, message manager, and callback handlers.
+        """
         try:
             logger.info(f"Initializing Anthropic client with model {self.model}...")
@@ -112,14 +124,7 @@ class AnthropicLoop(BaseLoop):
                 provider=self.provider, api_key=self.api_key, model=self.model
             )
-            # Initialize message manager
-            self.message_manager = MessageManager(
-                image_retention_config=ImageRetentionConfig(
-                    num_images_to_keep=self.only_n_most_recent_images, enable_caching=True
-                )
-            )
-            # Initialize callback manager
+            # Initialize callback manager with our callback handlers
             self.callback_manager = CallbackManager(
                 content_callback=self._handle_content,
                 tool_callback=self._handle_tool_result,
@@ -136,62 +141,22 @@ class AnthropicLoop(BaseLoop):
             self.client = None
             raise RuntimeError(f"Failed to initialize Anthropic client: {str(e)}")
-    async def _process_screen(
-        self, parsed_screen: Dict[str, Any], messages: List[Dict[str, Any]]
-    ) -> None:
-        """Process screen information and add to messages.
+    ###########################################
+    # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
-        Args:
-            parsed_screen: Dictionary containing parsed screen info
-            messages: List of messages to update
-        """
-        try:
-            # Extract screenshot from parsed screen
-            screenshot_base64 = parsed_screen.get("screenshot_base64")
-            if screenshot_base64:
-                # Remove data URL prefix if present
-                if "," in screenshot_base64:
-                    screenshot_base64 = screenshot_base64.split(",")[1]
-                # Create Anthropic-compatible message with image
-                screen_info_msg = {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image",
-                            "source": {
-                                "type": "base64",
-                                "media_type": "image/png",
-                                "data": screenshot_base64,
-                            },
-                        }
-                    ],
-                }
-                # Add screen info message to messages
-                messages.append(screen_info_msg)
-        except Exception as e:
-            logger.error(f"Error processing screen info: {str(e)}")
-            raise
-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
         """Run the agent loop with provided messages.
         Args:
-            messages: List of message objects
+            messages: List of message objects in standard OpenAI format
         Yields:
-            Dict containing response data
+            Agent response format
         """
         try:
             logger.info("Starting Anthropic loop run")
-            # Reset message history and add new messages
-            self.message_history = []
-            self.message_history.extend(messages)
             # Create queue for response streaming
             queue = asyncio.Queue()
@@ -204,7 +169,7 @@ class AnthropicLoop(BaseLoop):
                 logger.info("Client initialized successfully")
             # Start loop in background task
-            loop_task = asyncio.create_task(self._run_loop(queue))
+            loop_task = asyncio.create_task(self._run_loop(queue, messages))
             # Process and yield messages as they arrive
             while True:
@@ -236,37 +201,87 @@ class AnthropicLoop(BaseLoop):
                 "metadata": {"title": "❌ Error"},
             }
-    async def _run_loop(self, queue: asyncio.Queue) -> None:
-        """Run the agent loop with current message history.
+    ###########################################
+    # AGENT LOOP IMPLEMENTATION
+    ###########################################
+    async def _run_loop(self, queue: asyncio.Queue, messages: List[Dict[str, Any]]) -> None:
+        """Run the agent loop with provided messages.
         Args:
             queue: Queue for response streaming
+            messages: List of messages in standard OpenAI format
         """
         try:
             while True:
-                # Get up-to-date screen information
-                parsed_screen = await self._get_parsed_screen_som()
-                # Process screen info and update messages
-                await self._process_screen(parsed_screen, self.message_history)
+                # Capture screenshot
+                try:
+                    # Take screenshot - always returns raw PNG bytes
+                    screenshot = await self.computer.interface.screenshot()
+                    logger.info("Screenshot captured successfully")
+                    # Convert PNG bytes to base64
+                    base64_image = base64.b64encode(screenshot).decode("utf-8")
+                    logger.info(f"Screenshot converted to base64 (size: {len(base64_image)} bytes)")
+                    # Save screenshot if requested
+                    if self.save_trajectory and self.experiment_manager:
+                        try:
+                            self._save_screenshot(base64_image, action_type="state")
+                            logger.info("Screenshot saved to trajectory")
+                        except Exception as e:
+                            logger.error(f"Error saving screenshot: {str(e)}")
+                    # Create screenshot message
+                    screen_info_msg = {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": "image/png",
+                                    "data": base64_image,
+                                },
+                            }
+                        ],
+                    }
+                    # Add screenshot to messages
+                    messages.append(screen_info_msg)
+                    logger.info("Screenshot message added to conversation")
-                # Prepare messages and make API call
-                if self.message_manager is None:
-                    raise RuntimeError(
-                        "Message manager not initialized. Call initialize_client() first."
-                    )
-                prepared_messages = self.message_manager.prepare_messages(
-                    cast(List[BetaMessageParam], self.message_history.copy())
-                )
+                except Exception as e:
+                    logger.error(f"Error capturing or processing screenshot: {str(e)}")
+                    raise
                 # Create new turn directory for this API call
                 self._create_turn_dir()
-                # Use _make_api_call instead of direct client call to ensure logging
-                response = await self._make_api_call(prepared_messages)
+                # Convert standard messages to Anthropic format using utility function
+                anthropic_messages, system_content = to_anthropic_format(messages.copy())
+                # Use API handler to make API call with Anthropic format
+                response = await self.api_handler.make_api_call(
+                    messages=cast(List[BetaMessageParam], anthropic_messages),
+                    system_prompt=system_content or SYSTEM_PROMPT,
+                )
+                # Use response handler to handle the response and get new messages
+                new_messages, should_continue = await self.response_handler.handle_response(
+                    response, messages
+                )
-                # Handle the response
-                if not await self._handle_response(response, self.message_history):
+                # Add new messages to the parent's message history
+                messages.extend(new_messages)
+                openai_compatible_response = await to_agent_response_format(
+                    response,
+                    messages,
+                    model=self.model,
+                )
+                await queue.put(openai_compatible_response)
+                if not should_continue:
                     break
             # Signal completion
@@ -283,142 +298,101 @@ class AnthropicLoop(BaseLoop):
             )
             await queue.put(None)
-    async def _make_api_call(self, messages: List[BetaMessageParam]) -> BetaMessage:
-        """Make API call to Anthropic with retry logic.
-        Args:
-            messages: List of messages to send to the API
-        Returns:
-            API response
-        """
-        if self.client is None:
-            raise RuntimeError("Client not initialized. Call initialize_client() first.")
-        if self.tool_manager is None:
-            raise RuntimeError("Tool manager not initialized. Call initialize_client() first.")
-        last_error = None
-        for attempt in range(self.max_retries):
-            try:
-                # Log request
-                request_data = {
-                    "messages": messages,
-                    "max_tokens": self.max_tokens,
-                    "system": SYSTEM_PROMPT,
-                }
-                # Let ExperimentManager handle sanitization
-                self._log_api_call("request", request_data)
-                # Setup betas and system
-                system = BetaTextBlockParam(
-                    type="text",
-                    text=SYSTEM_PROMPT,
-                )
-                betas = [COMPUTER_USE_BETA_FLAG]
-                # Temporarily disable prompt caching due to "A maximum of 4 blocks with cache_control may be provided" error
-                # if self.message_manager.image_retention_config.enable_caching:
-                #     betas.append(PROMPT_CACHING_BETA_FLAG)
-                #     system["cache_control"] = {"type": "ephemeral"}
-                # Make API call
-                response = await self.client.create_message(
-                    messages=messages,
-                    system=[system],
-                    tools=self.tool_manager.get_tool_params(),
-                    max_tokens=self.max_tokens,
-                    betas=betas,
-                )
-                # Let ExperimentManager handle sanitization
-                self._log_api_call("response", request_data, response)
-                return response
-            except Exception as e:
-                last_error = e
-                logger.error(
-                    f"Error in API call (attempt {attempt + 1}/{self.max_retries}): {str(e)}"
-                )
-                self._log_api_call("error", {"messages": messages}, error=e)
-                if attempt < self.max_retries - 1:
-                    await asyncio.sleep(self.retry_delay * (attempt + 1))  # Exponential backoff
-                continue
-        # If we get here, all retries failed
-        error_message = f"API call failed after {self.max_retries} attempts"
-        if last_error:
-            error_message += f": {str(last_error)}"
-        logger.error(error_message)
-        raise RuntimeError(error_message)
+    ###########################################
+    # RESPONSE AND CALLBACK HANDLING
+    ###########################################
     async def _handle_response(self, response: BetaMessage, messages: List[Dict[str, Any]]) -> bool:
-        """Handle the Anthropic API response.
+        """Handle a response from the Anthropic API.
         Args:
-            response: API response
-            messages: List of messages to update
+            response: The response from the Anthropic API
+            messages: The message history
         Returns:
-            True if the loop should continue, False otherwise
+            bool: Whether to continue the conversation
         """
         try:
-            # Convert response to parameter format
-            response_params = self._response_to_params(response)
-            # Add response to messages
-            messages.append(
-                {
-                    "role": "assistant",
-                    "content": response_params,
-                }
+            # Convert response to standard format
+            openai_compatible_response = await to_agent_response_format(
+                response,
+                messages,
+                model=self.model,
             )
+            # Put the response on the queue
+            await self.queue.put(openai_compatible_response)
             if self.callback_manager is None:
                 raise RuntimeError(
                     "Callback manager not initialized. Call initialize_client() first."
                 )
-            # Handle tool use blocks and collect results
+            # Handle tool use blocks and collect ALL results before adding to messages
             tool_result_content = []
-            for content_block in response_params:
+            has_tool_use = False
+            for content_block in response.content:
                 # Notify callback of content
                 self.callback_manager.on_content(cast(BetaContentBlockParam, content_block))
-                # Handle tool use
-                if content_block.get("type") == "tool_use":
+                # Handle tool use - carefully check and access attributes
+                if hasattr(content_block, "type") and content_block.type == "tool_use":
+                    has_tool_use = True
                     if self.tool_manager is None:
                         raise RuntimeError(
                             "Tool manager not initialized. Call initialize_client() first."
                         )
+                    # Safely get attributes
+                    tool_name = getattr(content_block, "name", "")
+                    tool_input = getattr(content_block, "input", {})
+                    tool_id = getattr(content_block, "id", "")
                     result = await self.tool_manager.execute_tool(
-                        name=content_block["name"],
-                        tool_input=cast(Dict[str, Any], content_block["input"]),
+                        name=tool_name,
+                        tool_input=cast(Dict[str, Any], tool_input),
                     )
-                    # Create tool result and add to content
-                    tool_result = self._make_tool_result(
-                        cast(ToolResult, result), content_block["id"]
-                    )
+                    # Create tool result
+                    tool_result = self._make_tool_result(cast(ToolResult, result), tool_id)
                     tool_result_content.append(tool_result)
                     # Notify callback of tool result
-                    self.callback_manager.on_tool_result(
-                        cast(ToolResult, result), content_block["id"]
+                    self.callback_manager.on_tool_result(cast(ToolResult, result), tool_id)
+            # If we had any tool_use blocks, we MUST add the tool_result message
+            # even if there were errors or no actual results
+            if has_tool_use:
+                # If somehow we have no tool results but had tool uses, add synthetic error results
+                if not tool_result_content:
+                    logger.warning(
+                        "Had tool uses but no tool results, adding synthetic error results"
                     )
-            # If no tool results, we're done
-            if not tool_result_content:
-                # Signal completion
+                    for content_block in response.content:
+                        if hasattr(content_block, "type") and content_block.type == "tool_use":
+                            tool_id = getattr(content_block, "id", "")
+                            if tool_id:
+                                tool_result_content.append(
+                                    {
+                                        "type": "tool_result",
+                                        "tool_use_id": tool_id,
+                                        "content": {
+                                            "type": "error",
+                                            "text": "Tool execution was skipped or failed",
+                                        },
+                                        "is_error": True,
+                                    }
+                                )
+                # Add ALL tool results as a SINGLE user message
+                messages.append({"role": "user", "content": tool_result_content})
+                return True
+            else:
+                # No tool uses, we're done
                 self.callback_manager.on_content({"type": "text", "text": "<DONE>"})
                 return False
-            # Add tool results to message history
-            messages.append({"content": tool_result_content, "role": "user"})
-            return True
         except Exception as e:
             logger.error(f"Error handling response: {str(e)}")
             messages.append(
@@ -429,28 +403,41 @@ class AnthropicLoop(BaseLoop):
             )
             return False
-    def _response_to_params(
-        self,
-        response: BetaMessage,
-    ) -> List[Dict[str, Any]]:
-        """Convert API response to message parameters.
+    def _response_to_blocks(self, response: BetaMessage) -> List[Dict[str, Any]]:
+        """Convert Anthropic API response to standard blocks format.
         Args:
             response: API response message
         Returns:
-            List of content blocks
+            List of content blocks in standard format
         """
         result = []
         for block in response.content:
             if isinstance(block, BetaTextBlock):
                 result.append({"type": "text", "text": block.text})
+            elif hasattr(block, "type") and block.type == "tool_use":
+                # Safely access attributes after confirming it's a tool_use
+                result.append(
+                    {
+                        "type": "tool_use",
+                        "id": getattr(block, "id", ""),
+                        "name": getattr(block, "name", ""),
+                        "input": getattr(block, "input", {}),
+                    }
+                )
             else:
-                result.append(cast(Dict[str, Any], block.model_dump()))
+                # For other block types, convert to dict
+                block_dict = {}
+                for key, value in vars(block).items():
+                    if not key.startswith("_"):
+                        block_dict[key] = value
+                result.append(block_dict)
         return result
     def _make_tool_result(self, result: ToolResult, tool_use_id: str) -> Dict[str, Any]:
-        """Convert a tool result to API format.
+        """Convert a tool result to standard format.
         Args:
             result: Tool execution result
@@ -489,12 +476,8 @@ class AnthropicLoop(BaseLoop):
             if result.base64_image:
                 tool_result_content.append(
                     {
-                        "type": "image",
-                        "source": {
-                            "type": "base64",
-                            "media_type": "image/png",
-                            "data": result.base64_image,
-                        },
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{result.base64_image}"},
                     }
                 )
@@ -519,16 +502,19 @@ class AnthropicLoop(BaseLoop):
             result_text = f"<s>{result.system}</s>\n{result_text}"
         return result_text
-    def _handle_content(self, content: BetaContentBlockParam) -> None:
+    ###########################################
+    # CALLBACK HANDLERS
+    ###########################################
+    def _handle_content(self, content):
         """Handle content updates from the assistant."""
         if content.get("type") == "text":
-            text_content = cast(BetaTextBlockParam, content)
-            text = text_content["text"]
+            text = content.get("text", "")
             if text == "<DONE>":
                 return
             logger.info(f"Assistant: {text}")
-    def _handle_tool_result(self, result: ToolResult, tool_id: str) -> None:
+    def _handle_tool_result(self, result, tool_id):
         """Handle tool execution results."""
         if result.error:
             logger.error(f"Tool {tool_id} error: {result.error}")

cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.6py3-none-any.whl → 0.1.17py3-none-any.whl