PyPI - cua-agent - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

cua-agent: 0.1.6 (py3-none-any.whl) → 0.1.17 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (42)

agent/__init__.py +3 -2
agent/core/__init__.py +0 -5
agent/core/computer_agent.py +21 -28
agent/core/loop.py +78 -124
agent/core/messages.py +279 -125
agent/core/types.py +35 -0
agent/core/visualization.py +197 -0
agent/providers/anthropic/api/client.py +142 -1
agent/providers/anthropic/api_handler.py +140 -0
agent/providers/anthropic/callbacks/__init__.py +5 -0
agent/providers/anthropic/loop.py +206 -220
agent/providers/anthropic/response_handler.py +229 -0
agent/providers/anthropic/tools/bash.py +0 -97
agent/providers/anthropic/utils.py +370 -0
agent/providers/omni/__init__.py +1 -20
agent/providers/omni/api_handler.py +42 -0
agent/providers/omni/clients/anthropic.py +4 -0
agent/providers/omni/image_utils.py +0 -72
agent/providers/omni/loop.py +490 -606
agent/providers/omni/parser.py +58 -4
agent/providers/omni/tools/__init__.py +25 -7
agent/providers/omni/tools/base.py +29 -0
agent/providers/omni/tools/bash.py +43 -38
agent/providers/omni/tools/computer.py +144 -182
agent/providers/omni/tools/manager.py +25 -45
agent/providers/omni/types.py +0 -4
agent/providers/omni/utils.py +224 -145
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
cua_agent-0.1.17.dist-info/RECORD +63 -0
agent/providers/omni/callbacks.py +0 -78
agent/providers/omni/clients/groq.py +0 -101
agent/providers/omni/experiment.py +0 -276
agent/providers/omni/messages.py +0 -171
agent/providers/omni/tool_manager.py +0 -91
agent/providers/omni/visualization.py +0 -130
agent/types/__init__.py +0 -23
agent/types/base.py +0 -41
agent/types/messages.py +0 -36
cua_agent-0.1.6.dist-info/RECORD +0 -64
/agent/{types → core}/tools.py +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0

agent/providers/omni/loop.py (changed)

@@ -1,34 +1,28 @@
 """Omni-specific agent loop implementation."""
 import logging
-from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator, Union
-import base64
-from PIL import Image
-from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple, AsyncGenerator
 import json
 import re
 import os
-from datetime import datetime
 import asyncio
 from httpx import ConnectError, ReadTimeout
-import shutil
-import copy
 from typing import cast
-from .parser import OmniParser, ParseResult, ParserMetadata, UIElement
+from .parser import OmniParser, ParseResult
 from ...core.loop import BaseLoop
+from ...core.visualization import VisualizationHelper
+from ...core.messages import StandardMessageManager, ImageRetentionConfig
+from .utils import to_openai_agent_response_format
+from ...core.types import AgentResponse
 from computer import Computer
 from .types import LLMProvider
-from .clients.base import BaseOmniClient
 from .clients.openai import OpenAIClient
-from .clients.groq import GroqClient
 from .clients.anthropic import AnthropicClient
 from .prompts import SYSTEM_PROMPT
-from .utils import compress_image_base64
-from .visualization import visualize_click, visualize_scroll, calculate_element_center
-from .image_utils import decode_base64_image, clean_base64_data
-from ...core.messages import ImageRetentionConfig
-from .messages import OmniMessageManager
+from .api_handler import OmniAPIHandler
+from .tools.manager import ToolManager
+from .tools import ToolResult
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -42,7 +36,16 @@ def extract_data(input_string: str, data_type: str) -> str:
 class OmniLoop(BaseLoop):
-    """Omni-specific implementation of the agent loop."""
+    """Omni-specific implementation of the agent loop.
+    This class extends BaseLoop to provide support for multimodal models
+    from various providers (OpenAI, Anthropic, etc.) with UI parsing
+    and desktop automation capabilities.
+    """
+    ###########################################
+    # INITIALIZATION AND CONFIGURATION
+    ###########################################
     def __init__(
         self,
@@ -77,8 +80,9 @@ class OmniLoop(BaseLoop):
         self.provider = provider
         # Initialize message manager with image retention config
-        image_retention_config = ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
-        self.message_manager = OmniMessageManager(config=image_retention_config)
+        self.message_manager = StandardMessageManager(
+            config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images)
+        )
         # Initialize base class (which will set up experiment manager)
         super().__init__(
@@ -97,87 +101,53 @@ class OmniLoop(BaseLoop):
         self.client = None
         self.retry_count = 0
-    def _should_save_debug_image(self) -> bool:
-        """Check if debug images should be saved.
-        Returns:
-            bool: Always returns False as debug image saving has been disabled.
-        """
-        # Debug image saving functionality has been removed
-        return False
-    def _extract_and_save_images(self, data: Any, prefix: str) -> None:
-        """Extract and save images from API data.
+        # Initialize handlers
+        self.api_handler = OmniAPIHandler(loop=self)
+        self.viz_helper = VisualizationHelper(agent=self)
-        This method is now a no-op as image extraction functionality has been removed.
+        # Initialize tool manager
+        self.tool_manager = ToolManager(computer=computer, provider=provider)
-        Args:
-            data: Data to extract images from
-            prefix: Prefix for the extracted image filenames
-        """
-        # Image extraction functionality has been removed
-        return
+        logger.info("OmniLoop initialized with StandardMessageManager")
-    def _save_debug_image(self, image_data: str, filename: str) -> None:
-        """Save a debug image to the current turn directory.
-        This method is now a no-op as debug image saving functionality has been removed.
-        Args:
-            image_data: Base64 encoded image data
-            filename: Name to use for the saved image
-        """
-        # Debug image saving functionality has been removed
-        return
-    def _visualize_action(self, x: int, y: int, img_base64: str) -> None:
-        """Visualize an action by drawing on the screenshot."""
-        if (
-            not self.save_trajectory
-            or not hasattr(self, "experiment_manager")
-            or not self.experiment_manager
-        ):
-            return
+    async def initialize(self) -> None:
+        """Initialize the loop by setting up tools and clients."""
+        # Initialize base class
+        await super().initialize()
+        # Initialize tool manager with error handling
         try:
-            # Use the visualization utility
-            img = visualize_click(x, y, img_base64)
-            # Save the visualization
-            self.experiment_manager.save_action_visualization(img, "click", f"x{x}_y{y}")
+            logger.info("Initializing tool manager...")
+            await self.tool_manager.initialize()
+            logger.info("Tool manager initialized successfully.")
         except Exception as e:
-            logger.error(f"Error visualizing action: {str(e)}")
-    def _visualize_scroll(self, direction: str, clicks: int, img_base64: str) -> None:
-        """Visualize a scroll action by drawing arrows on the screenshot."""
-        if (
-            not self.save_trajectory
-            or not hasattr(self, "experiment_manager")
-            or not self.experiment_manager
-        ):
-            return
-        try:
-            # Use the visualization utility
-            img = visualize_scroll(direction, clicks, img_base64)
-            # Save the visualization
-            self.experiment_manager.save_action_visualization(
-                img, "scroll", f"{direction}_{clicks}"
+            logger.error(f"Error initializing tool manager: {str(e)}")
+            logger.warning("Will attempt to initialize tools on first use.")
+        # Initialize API clients based on provider
+        if self.provider == LLMProvider.ANTHROPIC:
+            self.client = AnthropicClient(
+                api_key=self.api_key,
+                model=self.model,
             )
-        except Exception as e:
-            logger.error(f"Error visualizing scroll: {str(e)}")
+        elif self.provider == LLMProvider.OPENAI:
+            self.client = OpenAIClient(
+                api_key=self.api_key,
+                model=self.model,
+            )
+        else:
+            raise ValueError(f"Unsupported provider: {self.provider}")
-    def _save_action_visualization(
-        self, img: Image.Image, action_name: str, details: str = ""
-    ) -> str:
-        """Save a visualization of an action."""
-        if hasattr(self, "experiment_manager") and self.experiment_manager:
-            return self.experiment_manager.save_action_visualization(img, action_name, details)
-        return ""
+    ###########################################
+    # CLIENT INITIALIZATION - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
     async def initialize_client(self) -> None:
-        """Initialize the appropriate client based on provider."""
+        """Initialize the appropriate client based on provider.
+        Implements abstract method from BaseLoop to set up the specific
+        provider client (OpenAI, Anthropic, etc.).
+        """
         try:
             logger.info(f"Initializing {self.provider} client with model {self.model}...")
@@ -199,6 +169,10 @@ class OmniLoop(BaseLoop):
             self.client = None
             raise RuntimeError(f"Failed to initialize client: {str(e)}")
+    ###########################################
+    # API CALL HANDLING
+    ###########################################
     async def _make_api_call(self, messages: List[Dict[str, Any]], system_prompt: str) -> Any:
         """Make API call to provider with retry logic."""
         # Create new turn directory for this API call
@@ -218,68 +192,73 @@ class OmniLoop(BaseLoop):
                     if self.client is None:
                         raise RuntimeError("Failed to initialize client")
-                # Set the provider in message manager based on current provider
-                provider_name = str(self.provider).split(".")[-1].lower()  # Extract name from enum
-                self.message_manager.set_provider(provider_name)
-                # Apply image retention and prepare messages
-                # This will limit the number of images based on only_n_most_recent_images
-                prepared_messages = self.message_manager.get_formatted_messages(provider_name)
+                # Get messages in standard format from the message manager
+                self.message_manager.messages = messages.copy()
+                prepared_messages = self.message_manager.get_messages()
-                # Filter out system messages for Anthropic
+                # Special handling for Anthropic
                 if self.provider == LLMProvider.ANTHROPIC:
+                    # Convert to Anthropic format
+                    anthropic_messages, anthropic_system = self.message_manager.to_anthropic_format(
+                        prepared_messages
+                    )
+                    # Filter out any empty/invalid messages
                     filtered_messages = [
-                        msg for msg in prepared_messages if msg["role"] != "system"
+                        msg
+                        for msg in anthropic_messages
+                        if msg.get("role") in ["user", "assistant"]
                     ]
-                else:
-                    filtered_messages = prepared_messages
-                # Log request
-                request_data = {"messages": filtered_messages, "max_tokens": self.max_tokens}
+                    # Ensure there's at least one message for Anthropic
+                    if not filtered_messages:
+                        logger.warning(
+                            "No valid messages found for Anthropic API call. Adding a default user message."
+                        )
+                        filtered_messages = [
+                            {
+                                "role": "user",
+                                "content": [
+                                    {"type": "text", "text": "Please help with this task."}
+                                ],
+                            }
+                        ]
-                if self.provider == LLMProvider.ANTHROPIC:
-                    request_data["system"] = self._get_system_prompt()
-                else:
-                    request_data["system"] = system_prompt
+                    # Combine system prompts if needed
+                    final_system_prompt = anthropic_system or system_prompt
-                self._log_api_call("request", request_data)
+                    # Log request
+                    request_data = {
+                        "messages": filtered_messages,
+                        "max_tokens": self.max_tokens,
+                        "system": final_system_prompt,
+                    }
-                # Make API call with appropriate parameters
-                if self.client is None:
-                    raise RuntimeError("Client not initialized. Call initialize_client() first.")
-                # Check if the method is async by inspecting the client implementation
-                run_method = self.client.run_interleaved
-                is_async = asyncio.iscoroutinefunction(run_method)
-                if is_async:
-                    # For async implementations (AnthropicClient)
-                    if self.provider == LLMProvider.ANTHROPIC:
-                        response = await run_method(
-                            messages=filtered_messages,
-                            system=self._get_system_prompt(),
-                            max_tokens=self.max_tokens,
-                        )
-                    else:
-                        response = await run_method(
-                            messages=messages,
-                            system=system_prompt,
-                            max_tokens=self.max_tokens,
-                        )
+                    self._log_api_call("request", request_data)
+                    # Make API call
+                    response = await self.client.run_interleaved(
+                        messages=filtered_messages,
+                        system=final_system_prompt,
+                        max_tokens=self.max_tokens,
+                    )
                 else:
-                    # For non-async implementations (GroqClient, etc.)
-                    if self.provider == LLMProvider.ANTHROPIC:
-                        response = run_method(
-                            messages=filtered_messages,
-                            system=self._get_system_prompt(),
-                            max_tokens=self.max_tokens,
-                        )
-                    else:
-                        response = run_method(
-                            messages=messages,
-                            system=system_prompt,
-                            max_tokens=self.max_tokens,
-                        )
+                    # For OpenAI and others, use standard format directly
+                    # Log request
+                    request_data = {
+                        "messages": prepared_messages,
+                        "max_tokens": self.max_tokens,
+                        "system": system_prompt,
+                    }
+                    self._log_api_call("request", request_data)
+                    # Make API call
+                    response = await self.client.run_interleaved(
+                        messages=prepared_messages,
+                        system=system_prompt,
+                        max_tokens=self.max_tokens,
+                    )
                 # Log success response
                 self._log_api_call("response", request_data, response)
@@ -327,6 +306,10 @@ class OmniLoop(BaseLoop):
         logger.error(error_message)
         raise RuntimeError(error_message)
+    ###########################################
+    # RESPONSE AND ACTION HANDLING
+    ###########################################
     async def _handle_response(
         self, response: Any, messages: List[Dict[str, Any]], parsed_screen: ParseResult
     ) -> Tuple[bool, bool]:
@@ -341,194 +324,151 @@ class OmniLoop(BaseLoop):
             Tuple of (should_continue, action_screenshot_saved)
         """
         action_screenshot_saved = False
+        # Helper function to safely add assistant messages using the message manager
+        def add_assistant_message(content):
+            if isinstance(content, str):
+                # Convert string to proper format
+                formatted_content = [{"type": "text", "text": content}]
+                self.message_manager.add_assistant_message(formatted_content)
+                logger.info("Added formatted text assistant message")
+            elif isinstance(content, list):
+                # Already in proper format
+                self.message_manager.add_assistant_message(content)
+                logger.info("Added structured assistant message")
+            else:
+                # Default case - convert to string
+                formatted_content = [{"type": "text", "text": str(content)}]
+                self.message_manager.add_assistant_message(formatted_content)
+                logger.info("Added converted assistant message")
         try:
-            # Handle Anthropic response format
+            # Step 1: Normalize response to standard format based on provider
+            standard_content = []
+            raw_text = None
+            # Convert response to standardized content based on provider
             if self.provider == LLMProvider.ANTHROPIC:
                 if hasattr(response, "content") and isinstance(response.content, list):
-                    # Extract text from content blocks
+                    # Convert Anthropic response to standard format
                     for block in response.content:
-                        if hasattr(block, "type") and block.type == "text":
-                            content = block.text
-                            # Try to find JSON in the content
-                            try:
-                                # First look for JSON block
-                                json_content = extract_data(content, "json")
-                                parsed_content = json.loads(json_content)
-                                logger.info("Successfully parsed JSON from code block")
-                            except (json.JSONDecodeError, IndexError):
-                                # If no JSON block, try to find JSON object in the text
-                                try:
-                                    # Look for JSON object pattern
-                                    json_pattern = r"\{[^}]+\}"
-                                    json_match = re.search(json_pattern, content)
-                                    if json_match:
-                                        json_str = json_match.group(0)
-                                        parsed_content = json.loads(json_str)
-                                        logger.info("Successfully parsed JSON from text")
-                                    else:
-                                        logger.error(f"No JSON found in content: {content}")
-                                        continue
-                                except json.JSONDecodeError as e:
-                                    logger.error(f"Failed to parse JSON from text: {str(e)}")
-                                    continue
-                            # Clean up Box ID format
-                            if "Box ID" in parsed_content and isinstance(
-                                parsed_content["Box ID"], str
-                            ):
-                                parsed_content["Box ID"] = parsed_content["Box ID"].replace(
-                                    "Box #", ""
-                                )
-                            # Add any explanatory text as reasoning if not present
-                            if "Explanation" not in parsed_content:
-                                # Extract any text before the JSON as reasoning
-                                text_before_json = content.split("{")[0].strip()
-                                if text_before_json:
-                                    parsed_content["Explanation"] = text_before_json
-                            # Log the parsed content for debugging
-                            logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
-                            # Add response to messages
-                            messages.append(
-                                {"role": "assistant", "content": json.dumps(parsed_content)}
-                            )
-                            try:
-                                # Execute action with current parsed screen info
-                                await self._execute_action(
-                                    parsed_content, cast(ParseResult, parsed_screen)
-                                )
-                                action_screenshot_saved = True
-                            except Exception as e:
-                                logger.error(f"Error executing action: {str(e)}")
-                                # Add error message to conversation
-                                messages.append(
-                                    {
-                                        "role": "assistant",
-                                        "content": f"Error executing action: {str(e)}",
-                                        "metadata": {"title": "❌ Error"},
-                                    }
-                                )
-                                return False, action_screenshot_saved
-                            # Check if task is complete
-                            if parsed_content.get("Action") == "None":
-                                return False, action_screenshot_saved
-                            return True, action_screenshot_saved
-                    logger.warning("No text block found in Anthropic response")
+                        if hasattr(block, "type"):
+                            if block.type == "text":
+                                standard_content.append({"type": "text", "text": block.text})
+                                # Store raw text for JSON parsing
+                                if raw_text is None:
+                                    raw_text = block.text
+                                else:
+                                    raw_text += "\n" + block.text
+                            else:
+                                # Add other block types
+                                block_dict = {}
+                                for key, value in vars(block).items():
+                                    if not key.startswith("_"):
+                                        block_dict[key] = value
+                                standard_content.append(block_dict)
+                else:
+                    logger.warning("Invalid Anthropic response format")
                     return True, action_screenshot_saved
-            # Handle other providers' response formats
-            if isinstance(response, dict) and "choices" in response:
-                content = response["choices"][0]["message"]["content"]
             else:
-                content = response
+                # Assume OpenAI or compatible format
+                try:
+                    raw_text = response["choices"][0]["message"]["content"]
+                    standard_content = [{"type": "text", "text": raw_text}]
+                except (KeyError, TypeError, IndexError) as e:
+                    logger.error(f"Invalid response format: {str(e)}")
+                    return True, action_screenshot_saved
-            # Parse JSON content
-            if isinstance(content, str):
+            # Step 2: Add the normalized response to message history
+            add_assistant_message(standard_content)
+            # Step 3: Extract JSON from the content for action execution
+            parsed_content = None
+            # If we have raw text, try to extract JSON from it
+            if raw_text:
+                # Try different approaches to extract JSON
                 try:
                     # First try to parse the whole content as JSON
-                    parsed_content = json.loads(content)
+                    parsed_content = json.loads(raw_text)
+                    logger.info("Successfully parsed whole content as JSON")
                 except json.JSONDecodeError:
                     try:
                         # Try to find JSON block
-                        json_content = extract_data(content, "json")
+                        json_content = extract_data(raw_text, "json")
                         parsed_content = json.loads(json_content)
+                        logger.info("Successfully parsed JSON from code block")
                     except (json.JSONDecodeError, IndexError):
                         try:
                             # Look for JSON object pattern
                             json_pattern = r"\{[^}]+\}"
-                            json_match = re.search(json_pattern, content)
+                            json_match = re.search(json_pattern, raw_text)
                             if json_match:
                                 json_str = json_match.group(0)
                                 parsed_content = json.loads(json_str)
+                                logger.info("Successfully parsed JSON from text")
                             else:
-                                logger.error(f"No JSON found in content: {content}")
+                                logger.error(f"No JSON found in content")
                                 return True, action_screenshot_saved
                         except json.JSONDecodeError as e:
                             logger.error(f"Failed to parse JSON from text: {str(e)}")
                             return True, action_screenshot_saved
+            # Step 4: Process the parsed content if available
+            if parsed_content:
                 # Clean up Box ID format
                 if "Box ID" in parsed_content and isinstance(parsed_content["Box ID"], str):
                     parsed_content["Box ID"] = parsed_content["Box ID"].replace("Box #", "")
                 # Add any explanatory text as reasoning if not present
-                if "Explanation" not in parsed_content:
+                if "Explanation" not in parsed_content and raw_text:
                     # Extract any text before the JSON as reasoning
-                    text_before_json = content.split("{")[0].strip()
+                    text_before_json = raw_text.split("{")[0].strip()
                     if text_before_json:
                         parsed_content["Explanation"] = text_before_json
-                # Add response to messages with stringified content
-                messages.append({"role": "assistant", "content": json.dumps(parsed_content)})
+                # Log the parsed content for debugging
+                logger.info(f"Parsed content: {json.dumps(parsed_content, indent=2)}")
+                # Step 5: Execute the action
                 try:
-                    # Execute action with current parsed screen info
-                    await self._execute_action(parsed_content, cast(ParseResult, parsed_screen))
-                    action_screenshot_saved = True
-                except Exception as e:
-                    logger.error(f"Error executing action: {str(e)}")
-                    # Add error message to conversation
-                    messages.append(
-                        {
-                            "role": "assistant",
-                            "content": f"Error executing action: {str(e)}",
-                            "metadata": {"title": "❌ Error"},
-                        }
+                    # Execute action using the common helper method
+                    should_continue, action_screenshot_saved = (
+                        await self._execute_action_with_tools(
+                            parsed_content, cast(ParseResult, parsed_screen)
+                        )
                     )
-                    return False, action_screenshot_saved
-                # Check if task is complete
-                if parsed_content.get("Action") == "None":
-                    return False, action_screenshot_saved
-                return True, action_screenshot_saved
-            elif isinstance(content, dict):
-                # Handle case where content is already a dictionary
-                messages.append({"role": "assistant", "content": json.dumps(content)})
-                try:
-                    # Execute action with current parsed screen info
-                    await self._execute_action(content, cast(ParseResult, parsed_screen))
-                    action_screenshot_saved = True
+                    # Check if task is complete
+                    if parsed_content.get("Action") == "None":
+                        return False, action_screenshot_saved
+                    return should_continue, action_screenshot_saved
                 except Exception as e:
                     logger.error(f"Error executing action: {str(e)}")
-                    # Add error message to conversation
-                    messages.append(
-                        {
-                            "role": "assistant",
-                            "content": f"Error executing action: {str(e)}",
-                            "metadata": {"title": "❌ Error"},
-                        }
-                    )
+                    # Update the last assistant message with error
+                    error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
+                    # Replace the last assistant message with the error
+                    self.message_manager.add_assistant_message(error_message)
                     return False, action_screenshot_saved
-                # Check if task is complete
-                if content.get("Action") == "None":
-                    return False, action_screenshot_saved
-                return True, action_screenshot_saved
             return True, action_screenshot_saved
         except Exception as e:
             logger.error(f"Error handling response: {str(e)}")
-            messages.append(
-                {
-                    "role": "assistant",
-                    "content": f"Error: {str(e)}",
-                    "metadata": {"title": "❌ Error"},
-                }
-            )
+            # Add error message using the message manager
+            error_message = [{"type": "text", "text": f"Error: {str(e)}"}]
+            self.message_manager.add_assistant_message(error_message)
             raise
+    ###########################################
+    # SCREEN PARSING - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
     async def _get_parsed_screen_som(self, save_screenshot: bool = True) -> ParseResult:
-        """Get parsed screen information with SOM.
+        """Get parsed screen information with Screen Object Model.
+        Extends the base class method to use the OmniParser to parse the screen
+        and extract UI elements.
         Args:
             save_screenshot: Whether to save the screenshot (set to False when screenshots will be saved elsewhere)
@@ -563,337 +503,26 @@ class OmniLoop(BaseLoop):
             logger.error(f"Error getting parsed screen: {str(e)}")
             raise
-    async def _process_screen(
-        self, parsed_screen: ParseResult, messages: List[Dict[str, Any]]
-    ) -> None:
-        """Process and add screen info to messages."""
-        try:
-            # Only add message if we have an image and provider supports it
-            if self.provider in [LLMProvider.OPENAI, LLMProvider.ANTHROPIC]:
-                image = parsed_screen.annotated_image_base64 or None
-                if image:
-                    # Save screen info to current turn directory
-                    if self.current_turn_dir:
-                        # Save elements as JSON
-                        elements_path = os.path.join(self.current_turn_dir, "elements.json")
-                        with open(elements_path, "w") as f:
-                            # Convert elements to dicts for JSON serialization
-                            elements_json = [elem.model_dump() for elem in parsed_screen.elements]
-                            json.dump(elements_json, f, indent=2)
-                            logger.info(f"Saved elements to {elements_path}")
-                    # Format the image content based on the provider
-                    if self.provider == LLMProvider.ANTHROPIC:
-                        # Compress the image before sending to Anthropic (5MB limit)
-                        image_size = len(image)
-                        logger.info(f"Image base64 is present, length: {image_size}")
-                        # Anthropic has a 5MB limit - check against base64 string length
-                        # which is what matters for the API call payload
-                        # Use slightly smaller limit (4.9MB) to account for request overhead
-                        max_size = int(4.9 * 1024 * 1024)  # 4.9MB
-                        # Default media type (will be overridden if compression is needed)
-                        media_type = "image/png"
-                        # Check if the image already has a media type prefix
-                        if image.startswith("data:"):
-                            parts = image.split(",", 1)
-                            if len(parts) == 2 and "image/jpeg" in parts[0].lower():
-                                media_type = "image/jpeg"
-                            elif len(parts) == 2 and "image/png" in parts[0].lower():
-                                media_type = "image/png"
-                        if image_size > max_size:
-                            logger.info(
-                                f"Image size ({image_size} bytes) exceeds Anthropic limit ({max_size} bytes), compressing..."
-                            )
-                            image, media_type = compress_image_base64(image, max_size)
-                            logger.info(
-                                f"Image compressed to {len(image)} bytes with media_type {media_type}"
-                            )
-                        # Anthropic uses "type": "image"
-                        screen_info_msg = {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "image",
-                                    "source": {
-                                        "type": "base64",
-                                        "media_type": media_type,
-                                        "data": image,
-                                    },
-                                }
-                            ],
-                        }
-                    else:
-                        # OpenAI and others use "type": "image_url"
-                        screen_info_msg = {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": f"data:image/png;base64,{image}"},
-                                }
-                            ],
-                        }
-                    messages.append(screen_info_msg)
-        except Exception as e:
-            logger.error(f"Error processing screen info: {str(e)}")
-            raise
     def _get_system_prompt(self) -> str:
         """Return the SYSTEM_PROMPT constant used as the system prompt for the model."""
         return SYSTEM_PROMPT
-    async def _execute_action(self, content: Dict[str, Any], parsed_screen: ParseResult) -> None:
-        """Execute the action specified in the content using the tool manager.
-        Args:
-            content: Dictionary containing the action details
-            parsed_screen: Current parsed screen information
-        """
-        try:
-            action = content.get("Action", "").lower()
-            if not action:
-                return
-            # Track if we saved an action-specific screenshot
-            action_screenshot_saved = False
-            try:
-                # Prepare kwargs based on action type
-                kwargs = {}
-                if action in ["left_click", "right_click", "double_click", "move_cursor"]:
-                    try:
-                        box_id = int(content["Box ID"])
-                        logger.info(f"Processing Box ID: {box_id}")
-                        # Calculate click coordinates
-                        x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
-                        logger.info(f"Calculated coordinates: x={x}, y={y}")
-                        kwargs["x"] = x
-                        kwargs["y"] = y
-                        # Visualize action if screenshot is available
-                        if parsed_screen.annotated_image_base64:
-                            img_data = parsed_screen.annotated_image_base64
-                            # Remove data URL prefix if present
-                            if img_data.startswith("data:image"):
-                                img_data = img_data.split(",")[1]
-                            # Only save visualization for coordinate-based actions
-                            self._visualize_action(x, y, img_data)
-                            action_screenshot_saved = True
-                    except ValueError as e:
-                        logger.error(f"Error processing Box ID: {str(e)}")
-                        return
-                elif action == "drag_to":
-                    try:
-                        box_id = int(content["Box ID"])
-                        x, y = await self._calculate_click_coordinates(box_id, parsed_screen)
-                        kwargs.update(
-                            {
-                                "x": x,
-                                "y": y,
-                                "button": content.get("button", "left"),
-                                "duration": float(content.get("duration", 0.5)),
-                            }
-                        )
-                        # Visualize drag destination if screenshot is available
-                        if parsed_screen.annotated_image_base64:
-                            img_data = parsed_screen.annotated_image_base64
-                            # Remove data URL prefix if present
-                            if img_data.startswith("data:image"):
-                                img_data = img_data.split(",")[1]
-                            # Only save visualization for coordinate-based actions
-                            self._visualize_action(x, y, img_data)
-                            action_screenshot_saved = True
-                    except ValueError as e:
-                        logger.error(f"Error processing drag coordinates: {str(e)}")
-                        return
-                elif action == "type_text":
-                    kwargs["text"] = content["Value"]
-                    # For type_text, store the value in the action type
-                    action_type = f"type_{content['Value'][:20]}"  # Truncate if too long
-                elif action == "press_key":
-                    kwargs["key"] = content["Value"]
-                    action_type = f"press_{content['Value']}"
-                elif action == "hotkey":
-                    if isinstance(content.get("Value"), list):
-                        keys = content["Value"]
-                        action_type = f"hotkey_{'_'.join(keys)}"
-                    else:
-                        # Simply split string format like "command+space" into a list
-                        keys = [k.strip() for k in content["Value"].lower().split("+")]
-                        action_type = f"hotkey_{content['Value'].replace('+', '_')}"
-                    logger.info(f"Preparing hotkey with keys: {keys}")
-                    # Get the method but call it with *args instead of **kwargs
-                    method = getattr(self.computer.interface, action)
-                    await method(*keys)  # Unpack the keys list as positional arguments
-                    logger.info(f"Tool execution completed successfully: {action}")
-                    # For hotkeys, take a screenshot after the action
-                    try:
-                        # Get a new screenshot after the action and save it with the action type
-                        new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
-                        if new_parsed_screen and new_parsed_screen.annotated_image_base64:
-                            img_data = new_parsed_screen.annotated_image_base64
-                            # Remove data URL prefix if present
-                            if img_data.startswith("data:image"):
-                                img_data = img_data.split(",")[1]
-                            # Save with action type to indicate this is a post-action screenshot
-                            self._save_screenshot(img_data, action_type=action_type)
-                            action_screenshot_saved = True
-                    except Exception as screenshot_error:
-                        logger.error(
-                            f"Error taking post-hotkey screenshot: {str(screenshot_error)}"
-                        )
-                    return
-                elif action in ["scroll_down", "scroll_up"]:
-                    clicks = int(content.get("amount", 1))
-                    kwargs["clicks"] = clicks
-                    action_type = f"scroll_{action.split('_')[1]}_{clicks}"
-                    # Visualize scrolling if screenshot is available
-                    if parsed_screen.annotated_image_base64:
-                        img_data = parsed_screen.annotated_image_base64
-                        # Remove data URL prefix if present
-                        if img_data.startswith("data:image"):
-                            img_data = img_data.split(",")[1]
-                        direction = "down" if action == "scroll_down" else "up"
-                        # For scrolling, we only save the visualization to avoid duplicate images
-                        self._visualize_scroll(direction, clicks, img_data)
-                        action_screenshot_saved = True
-                else:
-                    logger.warning(f"Unknown action: {action}")
-                    return
+    ###########################################
+    # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
+    ###########################################
-                # Execute tool and handle result
-                try:
-                    method = getattr(self.computer.interface, action)
-                    logger.info(f"Found method for action '{action}': {method}")
-                    await method(**kwargs)
-                    logger.info(f"Tool execution completed successfully: {action}")
-                    # For non-coordinate based actions that don't already have visualizations,
-                    # take a new screenshot after the action
-                    if not action_screenshot_saved:
-                        # Take a new screenshot
-                        try:
-                            # Get a new screenshot after the action and save it with the action type
-                            new_parsed_screen = await self._get_parsed_screen_som(
-                                save_screenshot=False
-                            )
-                            if new_parsed_screen and new_parsed_screen.annotated_image_base64:
-                                img_data = new_parsed_screen.annotated_image_base64
-                                # Remove data URL prefix if present
-                                if img_data.startswith("data:image"):
-                                    img_data = img_data.split(",")[1]
-                                # Save with action type to indicate this is a post-action screenshot
-                                if "action_type" in locals():
-                                    self._save_screenshot(img_data, action_type=action_type)
-                                else:
-                                    self._save_screenshot(img_data, action_type=action)
-                                # Update the action screenshot flag for this turn
-                                action_screenshot_saved = True
-                        except Exception as screenshot_error:
-                            logger.error(
-                                f"Error taking post-action screenshot: {str(screenshot_error)}"
-                            )
-                except AttributeError as e:
-                    logger.error(f"Method not found for action '{action}': {str(e)}")
-                    return
-                except Exception as tool_error:
-                    logger.error(f"Tool execution failed: {str(tool_error)}")
-                    return
-            except Exception as e:
-                logger.error(f"Error executing action {action}: {str(e)}")
-                return
-        except Exception as e:
-            logger.error(f"Error in _execute_action: {str(e)}")
-            return
-    async def _calculate_click_coordinates(
-        self, box_id: int, parsed_screen: ParseResult
-    ) -> Tuple[int, int]:
-        """Calculate click coordinates based on box ID.
-        Args:
-            box_id: The ID of the box to click
-            parsed_screen: The parsed screen information
-        Returns:
-            Tuple of (x, y) coordinates
-        Raises:
-            ValueError: If box_id is invalid or missing from parsed screen
-        """
-        # First try to use structured elements data
-        logger.info(f"Elements count: {len(parsed_screen.elements)}")
-        # Try to find element with matching ID
-        for element in parsed_screen.elements:
-            if element.id == box_id:
-                logger.info(f"Found element with ID {box_id}: {element}")
-                bbox = element.bbox
-                # Get screen dimensions from the metadata if available, or fallback
-                width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
-                height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
-                logger.info(f"Screen dimensions: width={width}, height={height}")
-                # Calculate center of the box in pixels
-                center_x = int((bbox.x1 + bbox.x2) / 2 * width)
-                center_y = int((bbox.y1 + bbox.y2) / 2 * height)
-                logger.info(f"Calculated center: ({center_x}, {center_y})")
-                # Validate coordinates - if they're (0,0) or unreasonably small,
-                # use a default position in the center of the screen
-                if center_x == 0 and center_y == 0:
-                    logger.warning("Got (0,0) coordinates, using fallback position")
-                    center_x = width // 2
-                    center_y = height // 2
-                    logger.info(f"Using fallback center: ({center_x}, {center_y})")
-                return center_x, center_y
-        # If we couldn't find the box, use center of screen
-        logger.error(
-            f"Box ID {box_id} not found in structured elements (count={len(parsed_screen.elements)})"
-        )
-        # Use center of screen as fallback
-        width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
-        height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
-        logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
-        return width // 2, height // 2
-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
         """Run the agent loop with provided messages.
         Args:
-            messages: List of message objects
+            messages: List of messages in standard OpenAI format
         Yields:
-            Dict containing response data
+            Agent response format
         """
-        # Keep track of conversation history
-        conversation_history = messages.copy()
+        # Initialize the message manager with the provided messages
+        self.message_manager.messages = messages.copy()
+        logger.info(f"Starting OmniLoop run with {len(self.message_manager.messages)} messages")
         # Continue running until explicitly told to stop
         running = True
@@ -922,26 +551,66 @@ class OmniLoop(BaseLoop):
                 # Get up-to-date screen information
                 parsed_screen = await self._get_parsed_screen_som()
-                # Process screen info and update messages
-                await self._process_screen(parsed_screen, conversation_history)
+                # Process screen info and update messages in standard format
+                try:
+                    # Get image from parsed screen
+                    image = parsed_screen.annotated_image_base64 or None
+                    if image:
+                        # Save elements as JSON if we have a turn directory
+                        if self.current_turn_dir and hasattr(parsed_screen, "elements"):
+                            elements_path = os.path.join(self.current_turn_dir, "elements.json")
+                            with open(elements_path, "w") as f:
+                                # Convert elements to dicts for JSON serialization
+                                elements_json = [
+                                    elem.model_dump() for elem in parsed_screen.elements
+                                ]
+                                json.dump(elements_json, f, indent=2)
+                                logger.info(f"Saved elements to {elements_path}")
+                        # Remove data URL prefix if present
+                        if "," in image:
+                            image = image.split(",")[1]
+                        # Add screenshot to message history using message manager
+                        self.message_manager.add_user_message(
+                            [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/png;base64,{image}"},
+                                }
+                            ]
+                        )
+                        logger.info("Added screenshot to message history")
+                except Exception as e:
+                    logger.error(f"Error processing screen info: {str(e)}")
+                    raise
                 # Get system prompt
                 system_prompt = self._get_system_prompt()
-                # Make API call with retries
-                response = await self._make_api_call(conversation_history, system_prompt)
+                # Make API call with retries using the APIHandler
+                response = await self.api_handler.make_api_call(
+                    self.message_manager.messages, system_prompt
+                )
                 # Handle the response (may execute actions)
                 # Returns: (should_continue, action_screenshot_saved)
                 should_continue, new_screenshot_saved = await self._handle_response(
-                    response, conversation_history, parsed_screen
+                    response, self.message_manager.messages, parsed_screen
                 )
                 # Update whether an action screenshot was saved this turn
                 action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
+                # Create OpenAI-compatible response format using utility function
+                openai_compatible_response = await to_openai_agent_response_format(
+                    response=response,
+                    messages=self.message_manager.messages,
+                    model=self.model,
+                )
                 # Yield the response to the caller
-                yield {"response": response}
+                yield openai_compatible_response
                 # Check if we should continue this conversation
                 running = should_continue
@@ -969,3 +638,218 @@ class OmniLoop(BaseLoop):
                 # Create a brief delay before retrying
                 await asyncio.sleep(1)
+    async def process_model_response(self, response_text: str) -> Optional[Dict[str, Any]]:
+        """Detect a function call in the model response text and execute it.
+        Only the "function_call" form is extracted; a response that merely contains
+        "tool_use" passes the outer check but yields no tool call. Exceptions are
+        caught and logged rather than propagated.
+        Args:
+            response_text: Model response text
+        Returns:
+            {"tool_result": <result>} when a tool call was extracted and run (the
+            result is a ToolResult carrying the error text if execution raised),
+            or None if no tool call was found or processing failed
+        """
+        try:
+            # Ensure tools are initialized before use
+            await self._ensure_tools_initialized()
+            # Cheap substring check for anything that looks like a tool call
+            if "function_call" in response_text or "tool_use" in response_text:
+                # The extract_tool_call method should be implemented in the OmniAPIHandler
+                # For now, we'll just use a simple approach
+                # This will be replaced with the proper implementation
+                # NOTE(review): there is no extraction branch for "tool_use", so such
+                # responses leave tool_info as None and fall through to `return None`.
+                tool_info = None
+                if "function_call" in response_text:
+                    # Extract function call params
+                    try:
+                        # Simple extraction - in real code this would be more robust
+                        import json
+                        import re
+                        # NOTE(review): [^}]+ stops at the first "}", so a function_call
+                        # object containing nested braces is truncated and will
+                        # typically fail to parse in json.loads below.
+                        match = re.search(r'"function_call"\s*:\s*{([^}]+)}', response_text)
+                        if match:
+                            # Re-wrap the captured interior in braces to form a JSON object
+                            function_text = "{" + match.group(1) + "}"
+                            tool_info = json.loads(function_text)
+                    except Exception as e:
+                        # Extraction is best-effort: log and continue with tool_info = None
+                        logger.error(f"Error extracting function call: {str(e)}")
+                if tool_info:
+                    try:
+                        # Execute the tool named by the call, defaulting to no arguments
+                        result = await self.tool_manager.execute_tool(
+                            name=tool_info.get("name"), tool_input=tool_info.get("arguments", {})
+                        )
+                        # Wrap the tool's result for the caller
+                        return {"tool_result": result}
+                    except Exception as e:
+                        # Report the execution failure as an error ToolResult instead of raising
+                        error_msg = (
+                            f"Error executing tool '{tool_info.get('name', 'unknown')}': {str(e)}"
+                        )
+                        logger.error(error_msg)
+                        return {"tool_result": ToolResult(error=error_msg)}
+        except Exception as e:
+            logger.error(f"Error processing tool call: {str(e)}")
+        # No tool call detected, extraction failed, or an error was logged above
+        return None
+    async def process_response_with_tools(
+        self, response_text: str, parsed_screen: Optional[ParseResult] = None
+    ) -> Tuple[bool, str]:
+        """Process model response, execute any tool call, and summarize the outcome.
+        Delegates detection and execution to process_model_response and converts
+        its result into a success flag plus a text observation.
+        Args:
+            response_text: Model response text
+            parsed_screen: Current parsed screen information (optional); not
+                referenced in this method's body
+        Returns:
+            Tuple of (action_taken, observation): (True, tool output or a default
+            success message) when a tool ran without error, (False, "ERROR: ...")
+            when the tool result carries an error, and (False, explanatory text)
+            when no tool call was detected
+        """
+        logger.info("Processing response with tools")
+        # Process the response to extract tool calls
+        tool_result = await self.process_model_response(response_text)
+        if tool_result and "tool_result" in tool_result:
+            # A tool was executed
+            result = tool_result["tool_result"]
+            if result.error:
+                return False, f"ERROR: {result.error}"
+            else:
+                # Fall back to a generic message when the tool produced no output
+                return True, result.output or "Tool executed successfully"
+        # No action or tool call found
+        return False, "No action taken - no tool call detected in response"
+    ###########################################
+    # UTILITY METHODS
+    ###########################################
+    async def _ensure_tools_initialized(self) -> None:
+        """Initialize the tool manager if it does not yet expose a tools collection.
+        Does nothing when self.tool_manager.tools already exists and is not None.
+        """
+        # A missing or None `tools` attribute is treated as "not initialized yet"
+        if not hasattr(self.tool_manager, "tools") or self.tool_manager.tools is None:
+            logger.info("Tools not initialized. Initializing now...")
+            await self.tool_manager.initialize()
+            logger.info("Tools initialized successfully.")
+    async def _execute_action_with_tools(
+        self, action_data: Dict[str, Any], parsed_screen: ParseResult
+    ) -> Tuple[bool, bool]:
+        """Execute an action using the tools-based approach.
+        Translates the parsed action dictionary into arguments for the "computer"
+        tool, saves a visualization or post-action screenshot, and reports whether
+        the agent loop should keep going. Exceptions are caught, logged, and
+        recorded as an assistant message instead of being raised.
+        Args:
+            action_data: Dictionary containing action details; reads "Action" and,
+                depending on the action, "Box ID", "Value", or "amount"
+            parsed_screen: Current parsed screen information
+        Returns:
+            Tuple of (should_continue, action_screenshot_saved). should_continue is
+            False when the action is empty or "none", when the Box ID cannot be
+            processed, or when an exception occurs; otherwise True
+        """
+        action_screenshot_saved = False
+        action_type = None  # Initialize for possible use in post-action screenshot
+        try:
+            # Extract the action
+            parsed_action = action_data.get("Action", "").lower()
+            # Only process if we have a valid action
+            if not parsed_action or parsed_action == "none":
+                return False, action_screenshot_saved
+            # Convert the parsed content to a format suitable for the tools system
+            tool_name = "computer"  # Default to computer tool
+            tool_args = {"action": parsed_action}
+            # Add specific arguments based on action type
+            # NOTE(review): there is no else branch, so any other action name is
+            # passed to the tool with only the "action" argument.
+            if parsed_action in ["left_click", "right_click", "double_click", "move_cursor"]:
+                # Calculate coordinates from Box ID using parser
+                try:
+                    box_id = int(action_data["Box ID"])
+                    x, y = await self.parser.calculate_click_coordinates(
+                        box_id, cast(ParseResult, parsed_screen)
+                    )
+                    tool_args["x"] = x
+                    tool_args["y"] = y
+                    # Visualize action if screenshot is available
+                    if parsed_screen and parsed_screen.annotated_image_base64:
+                        img_data = parsed_screen.annotated_image_base64
+                        # Remove data URL prefix if present
+                        if img_data.startswith("data:image"):
+                            img_data = img_data.split(",")[1]
+                        # Save visualization for coordinate-based actions
+                        self.viz_helper.visualize_action(x, y, img_data)
+                        action_screenshot_saved = True
+                except (ValueError, KeyError) as e:
+                    # Missing or non-integer Box ID: stop without executing the tool
+                    logger.error(f"Error processing Box ID: {str(e)}")
+                    return False, action_screenshot_saved
+            elif parsed_action == "type_text":
+                tool_args["text"] = action_data.get("Value", "")
+                # For type_text, store the value in the action type for screenshot naming
+                action_type = f"type_{tool_args['text'][:20]}"  # Truncate if too long
+            elif parsed_action == "press_key":
+                tool_args["key"] = action_data.get("Value", "")
+                action_type = f"press_{tool_args['key']}"
+            elif parsed_action == "hotkey":
+                value = action_data.get("Value", "")
+                if isinstance(value, list):
+                    tool_args["keys"] = value
+                    action_type = f"hotkey_{'_'.join(value)}"
+                else:
+                    # Split string format like "command+space" into a list
+                    keys = [k.strip() for k in value.lower().split("+")]
+                    tool_args["keys"] = keys
+                    action_type = f"hotkey_{value.replace('+', '_')}"
+            elif parsed_action in ["scroll_down", "scroll_up"]:
+                # Number of scroll steps; defaults to 1 when "amount" is absent
+                clicks = int(action_data.get("amount", 1))
+                tool_args["amount"] = clicks
+                action_type = f"scroll_{parsed_action.split('_')[1]}_{clicks}"
+                # Visualize scrolling if screenshot is available
+                if parsed_screen and parsed_screen.annotated_image_base64:
+                    img_data = parsed_screen.annotated_image_base64
+                    # Remove data URL prefix if present
+                    if img_data.startswith("data:image"):
+                        img_data = img_data.split(",")[1]
+                    direction = "down" if parsed_action == "scroll_down" else "up"
+                    # For scrolling, we save the visualization
+                    self.viz_helper.visualize_scroll(direction, clicks, img_data)
+                    action_screenshot_saved = True
+            # Ensure tools are initialized before use
+            await self._ensure_tools_initialized()
+            # Execute tool with prepared arguments
+            # NOTE(review): `result` is never inspected, so an error reported in the
+            # tool's return value (rather than raised) goes unnoticed here.
+            result = await self.tool_manager.execute_tool(name=tool_name, tool_input=tool_args)
+            # Take a new screenshot after the action if we haven't already saved one
+            if not action_screenshot_saved:
+                try:
+                    # Get a new screenshot after the action
+                    new_parsed_screen = await self._get_parsed_screen_som(save_screenshot=False)
+                    if new_parsed_screen and new_parsed_screen.annotated_image_base64:
+                        img_data = new_parsed_screen.annotated_image_base64
+                        # Remove data URL prefix if present
+                        if img_data.startswith("data:image"):
+                            img_data = img_data.split(",")[1]
+                        # Save with action type if defined, otherwise use the action name
+                        if action_type:
+                            self._save_screenshot(img_data, action_type=action_type)
+                        else:
+                            self._save_screenshot(img_data, action_type=parsed_action)
+                        action_screenshot_saved = True
+                except Exception as screenshot_error:
+                    # The screenshot is best-effort; a failure here does not abort the action
+                    logger.error(f"Error taking post-action screenshot: {str(screenshot_error)}")
+            # The "none"/empty case returned earlier, so reaching here means continue the loop
+            return True, action_screenshot_saved
+        except Exception as e:
+            logger.error(f"Error executing action: {str(e)}")
+            # Build an error message in content-block format
+            error_message = [{"type": "text", "text": f"Error executing action: {str(e)}"}]
+            # Record the error through add_assistant_message (presumably appends a new
+            # assistant message rather than replacing the last one - confirm)
+            self.message_manager.add_assistant_message(error_message)
+            return False, action_screenshot_saved

cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.6__py3-none-any.whl → 0.1.17__py3-none-any.whl