cua-agent 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic; see the registry's advisory for this release for more details.
- agent/README.md +63 -0
- agent/__init__.py +10 -0
- agent/core/README.md +101 -0
- agent/core/__init__.py +34 -0
- agent/core/agent.py +284 -0
- agent/core/base_agent.py +164 -0
- agent/core/callbacks.py +147 -0
- agent/core/computer_agent.py +69 -0
- agent/core/experiment.py +222 -0
- agent/core/factory.py +102 -0
- agent/core/loop.py +244 -0
- agent/core/messages.py +230 -0
- agent/core/tools/__init__.py +21 -0
- agent/core/tools/base.py +74 -0
- agent/core/tools/bash.py +52 -0
- agent/core/tools/collection.py +46 -0
- agent/core/tools/computer.py +113 -0
- agent/core/tools/edit.py +67 -0
- agent/core/tools/manager.py +56 -0
- agent/providers/__init__.py +4 -0
- agent/providers/anthropic/__init__.py +6 -0
- agent/providers/anthropic/api/client.py +222 -0
- agent/providers/anthropic/api/logging.py +150 -0
- agent/providers/anthropic/callbacks/manager.py +55 -0
- agent/providers/anthropic/loop.py +521 -0
- agent/providers/anthropic/messages/manager.py +110 -0
- agent/providers/anthropic/prompts.py +20 -0
- agent/providers/anthropic/tools/__init__.py +33 -0
- agent/providers/anthropic/tools/base.py +88 -0
- agent/providers/anthropic/tools/bash.py +163 -0
- agent/providers/anthropic/tools/collection.py +34 -0
- agent/providers/anthropic/tools/computer.py +550 -0
- agent/providers/anthropic/tools/edit.py +326 -0
- agent/providers/anthropic/tools/manager.py +54 -0
- agent/providers/anthropic/tools/run.py +42 -0
- agent/providers/anthropic/types.py +16 -0
- agent/providers/omni/__init__.py +27 -0
- agent/providers/omni/callbacks.py +78 -0
- agent/providers/omni/clients/anthropic.py +99 -0
- agent/providers/omni/clients/base.py +44 -0
- agent/providers/omni/clients/groq.py +101 -0
- agent/providers/omni/clients/openai.py +159 -0
- agent/providers/omni/clients/utils.py +25 -0
- agent/providers/omni/experiment.py +273 -0
- agent/providers/omni/image_utils.py +106 -0
- agent/providers/omni/loop.py +961 -0
- agent/providers/omni/messages.py +168 -0
- agent/providers/omni/parser.py +252 -0
- agent/providers/omni/prompts.py +78 -0
- agent/providers/omni/tool_manager.py +91 -0
- agent/providers/omni/tools/__init__.py +13 -0
- agent/providers/omni/tools/bash.py +69 -0
- agent/providers/omni/tools/computer.py +216 -0
- agent/providers/omni/tools/manager.py +83 -0
- agent/providers/omni/types.py +30 -0
- agent/providers/omni/utils.py +155 -0
- agent/providers/omni/visualization.py +130 -0
- agent/types/__init__.py +26 -0
- agent/types/base.py +52 -0
- agent/types/messages.py +36 -0
- agent/types/tools.py +32 -0
- cua_agent-0.1.0.dist-info/METADATA +44 -0
- cua_agent-0.1.0.dist-info/RECORD +65 -0
- cua_agent-0.1.0.dist-info/WHEEL +4 -0
- cua_agent-0.1.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Omni message manager implementation."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from PIL import Image
|
|
7
|
+
|
|
8
|
+
from ...core.messages import BaseMessageManager, ImageRetentionConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class OmniMessageManager(BaseMessageManager):
    """Message manager for multi-provider support.

    Keeps an ordered in-memory chat history (user / assistant / system
    messages in OpenAI-style dicts) and can re-emit that history in the
    wire format expected by each supported provider.
    """

    def __init__(self, config: Optional[ImageRetentionConfig] = None):
        """Initialize the message manager.

        Args:
            config: Optional configuration for image retention. When its
                ``num_images_to_keep`` is set, older screenshots are pruned
                from the history after each user message is added.
        """
        super().__init__(config)
        # History entries are {"role": ..., "content": str | list-of-parts}.
        self.messages: List[Dict[str, Any]] = []
        # Kept locally as well as in the base class so retention checks
        # below do not depend on the base implementation.
        self.config = config

    def add_user_message(self, content: str, images: Optional[List[bytes]] = None) -> None:
        """Add a user message to the history.

        Args:
            content: Message content
            images: Optional list of raw image bytes (encoded as PNG data
                URLs — presumably screenshots; verify against callers)
        """
        if images:
            # Mixed content: a leading text part followed by one
            # image_url part per image, base64-encoded as a data URL.
            message_content: List[Dict[str, Any]] = [{"type": "text", "text": content}]

            for img in images:
                message_content.append(
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64.b64encode(img).decode()}"
                        },
                    }
                )

            message = {"role": "user", "content": message_content}
        else:
            # Simple text message
            message = {"role": "user", "content": content}

        self.messages.append(message)

        # Apply retention policy. `is not None` (rather than truthiness) so
        # that an explicit limit of 0 means "drop all images" instead of
        # silently disabling pruning, which the old truthy check did.
        if self.config and self.config.num_images_to_keep is not None:
            self._apply_image_retention_policy()

    def add_assistant_message(self, content: str) -> None:
        """Add an assistant message to the history.

        Args:
            content: Message content
        """
        self.messages.append({"role": "assistant", "content": content})

    def add_system_message(self, content: str) -> None:
        """Add a system message to the history.

        Args:
            content: Message content
        """
        self.messages.append({"role": "system", "content": content})

    def _apply_image_retention_policy(self) -> None:
        """Apply image retention policy to message history.

        Walks user messages newest-first and keeps only the first
        ``num_images_to_keep`` image parts encountered; all other image
        parts are removed in place. Text parts are always kept.
        """
        if not self.config or self.config.num_images_to_keep is None:
            return

        # Count images from newest to oldest
        image_count = 0
        for message in reversed(self.messages):
            if message["role"] != "user":
                continue

            # Handle multimodal messages
            if isinstance(message["content"], list):
                new_content = []
                for item in message["content"]:
                    if item["type"] == "text":
                        new_content.append(item)
                    elif item["type"] == "image_url":
                        if image_count < self.config.num_images_to_keep:
                            new_content.append(item)
                            image_count += 1
                message["content"] = new_content

    def get_formatted_messages(self, provider: str) -> List[Dict[str, Any]]:
        """Get messages formatted for specific provider.

        Args:
            provider: Provider name ("anthropic", "openai", "groq", "qwen")

        Returns:
            List of formatted messages

        Raises:
            ValueError: If the provider name is not recognized.
        """
        if provider == "anthropic":
            return self._format_for_anthropic()
        elif provider == "openai":
            return self._format_for_openai()
        elif provider == "groq":
            return self._format_for_groq()
        elif provider == "qwen":
            return self._format_for_qwen()
        else:
            raise ValueError(f"Unsupported provider: {provider}")

    def _format_for_anthropic(self) -> List[Dict[str, Any]]:
        """Format messages for Anthropic API.

        Converts OpenAI-style ``image_url`` data-URL parts into Anthropic
        ``image``/``source`` blocks with raw base64 data.
        """
        formatted = []
        for msg in self.messages:
            formatted_msg = {"role": msg["role"]}

            # Handle multimodal content
            if isinstance(msg["content"], list):
                formatted_msg["content"] = []
                for item in msg["content"]:
                    if item["type"] == "text":
                        formatted_msg["content"].append({"type": "text", "text": item["text"]})
                    elif item["type"] == "image_url":
                        formatted_msg["content"].append(
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/png",
                                    # Strip the "data:image/png;base64," prefix.
                                    "data": item["image_url"]["url"].split(",")[1],
                                },
                            }
                        )
            else:
                formatted_msg["content"] = msg["content"]

            formatted.append(formatted_msg)
        return formatted

    def _format_for_openai(self) -> List[Dict[str, Any]]:
        """Format messages for OpenAI API."""
        # History is already stored in OpenAI format. NOTE: this returns the
        # live list (not a copy); callers must not mutate it.
        return self.messages

    def _format_for_groq(self) -> List[Dict[str, Any]]:
        """Format messages for Groq API."""
        # Groq uses OpenAI-compatible format
        return self.messages

    def _format_for_qwen(self) -> List[Dict[str, Any]]:
        """Format messages for Qwen API.

        Qwen messages here are text-only: each multimodal message is
        collapsed to its first text part (image parts are dropped).
        """
        formatted = []
        for msg in self.messages:
            if isinstance(msg["content"], list):
                # Convert multimodal content to text-only
                text_content = next(
                    (item["text"] for item in msg["content"] if item["type"] == "text"), ""
                )
                formatted.append({"role": msg["role"], "content": text_content})
            else:
                formatted.append(msg)
        return formatted
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""Parser implementation for the Omni provider."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
+
import base64
|
|
6
|
+
from PIL import Image
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
import json
|
|
9
|
+
import torch
|
|
10
|
+
|
|
11
|
+
# Import from the SOM package
|
|
12
|
+
from som import OmniParser as OmniDetectParser
|
|
13
|
+
from som.models import ParseResult, BoundingBox, UIElement, ImageData, ParserMetadata
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class OmniParser:
    """Parser for handling responses from multiple providers.

    Wraps the SOM package's ``OmniParser`` (imported as
    ``OmniDetectParser``) for screenshot element detection, and provides
    dict-based helpers for extracting tool calls / text content from
    provider responses.
    """

    # Class-level shared OmniDetectParser instance, so the detection model
    # is loaded once per process rather than once per OmniParser.
    _shared_parser = None

    def __init__(self, force_device: Optional[str] = None):
        """Initialize the OmniParser.

        Args:
            force_device: Optional device to force for detection (cpu/cuda/mps)
        """
        # NOTE(review): response_buffer is never read or written elsewhere
        # in this class — presumably reserved for streaming; confirm.
        self.response_buffer = []

        # Use shared parser if available, otherwise create a new one
        if OmniParser._shared_parser is None:
            logger.info("Initializing shared OmniDetectParser...")

            # Determine the best device to use: explicit override first,
            # then CUDA, then Apple MPS, falling back to CPU.
            device = force_device
            if not device:
                if torch.cuda.is_available():
                    device = "cuda"
                elif (
                    hasattr(torch, "backends")
                    and hasattr(torch.backends, "mps")
                    and torch.backends.mps.is_available()
                ):
                    device = "mps"
                else:
                    device = "cpu"

            logger.info(f"Using device: {device} for OmniDetectParser")
            self.detect_parser = OmniDetectParser(force_device=device)

            # Preload the detection model to avoid repeated loading
            try:
                # Access the detector to trigger model loading
                detector = self.detect_parser.detector
                if detector.model is None:
                    logger.info("Preloading detection model...")
                    detector.load_model()
                    logger.info("Detection model preloaded successfully")
            except Exception as e:
                # Best-effort: failure to preload is logged but not fatal;
                # the model may still load lazily on first parse.
                logger.error(f"Error preloading detection model: {str(e)}")

            # Store as shared instance
            OmniParser._shared_parser = self.detect_parser
        else:
            logger.info("Using existing shared OmniDetectParser")
            self.detect_parser = OmniParser._shared_parser

    async def parse_screen(self, computer: Any) -> ParseResult:
        """Parse a screenshot and extract screen information.

        Takes a screenshot via ``computer.screenshot()``, normalizes it to
        bytes if it arrives base64-encoded, and runs SOM element detection.
        Never raises: on any failure it returns an empty ``ParseResult``
        whose ``parsed_content_list`` carries the error message.

        Args:
            computer: Computer instance (must provide an async
                ``screenshot()`` returning bytes or a base64 string)

        Returns:
            ParseResult with screen elements and image data
        """
        try:
            # Get screenshot from computer
            logger.info("Taking screenshot...")
            screenshot = await computer.screenshot()

            # Log screenshot info
            logger.info(f"Screenshot type: {type(screenshot)}")
            logger.info(f"Screenshot is bytes: {isinstance(screenshot, bytes)}")
            logger.info(f"Screenshot is str: {isinstance(screenshot, str)}")
            logger.info(f"Screenshot length: {len(screenshot) if screenshot else 0}")

            # If screenshot is a string (likely base64), convert it to bytes
            if isinstance(screenshot, str):
                try:
                    screenshot = base64.b64decode(screenshot)
                    logger.info("Successfully converted base64 string to bytes")
                    logger.info(f"Decoded bytes length: {len(screenshot)}")
                except Exception as e:
                    # NOTE(review): on decode failure the raw string is still
                    # passed to the detector below; it will likely fail there
                    # and be caught by the outer handler.
                    logger.error(f"Error decoding base64: {str(e)}")
                    logger.error(f"First 100 chars of screenshot string: {screenshot[:100]}")

            # Pass screenshot to OmniDetectParser
            logger.info("Passing screenshot to OmniDetectParser...")
            parse_result = self.detect_parser.parse(
                screenshot_data=screenshot, box_threshold=0.3, iou_threshold=0.1, use_ocr=True
            )
            logger.info("Screenshot parsed successfully")
            logger.info(f"Parse result has {len(parse_result.elements)} elements")

            # Log element IDs for debugging
            for i, elem in enumerate(parse_result.elements):
                logger.info(
                    f"Element {i+1} (ID: {elem.id}): {elem.type} with confidence {elem.confidence:.3f}"
                )

            return parse_result

        except Exception as e:
            logger.error(f"Error parsing screen: {str(e)}")
            import traceback

            logger.error(traceback.format_exc())

            # Create a minimal valid result for error cases
            return ParseResult(
                elements=[],
                annotated_image_base64="",
                parsed_content_list=[f"Error: {str(e)}"],
                metadata=ParserMetadata(
                    image_size=(0, 0),
                    num_icons=0,
                    num_text=0,
                    device="cpu",
                    ocr_enabled=False,
                    latency=0.0,
                ),
            )

    def parse_tool_call(self, response: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Parse a tool call from the response.

        Checks three response shapes in order and returns the first match
        as ``{"name": ..., "arguments": ...}``.

        Args:
            response: Response from the provider (as a plain dict)

        Returns:
            Parsed tool call or None if no tool call found (also None on
            any parsing error — errors are logged, not raised)
        """
        try:
            # Top-level "tool_calls" list (OpenAI tools-style shape; the
            # original comment said Anthropic, but Anthropic's native format
            # uses tool_use content blocks — only the first call is taken).
            if "tool_calls" in response:
                tool_call = response["tool_calls"][0]
                return {
                    "name": tool_call["function"]["name"],
                    "arguments": tool_call["function"]["arguments"],
                }

            # Top-level legacy "function_call" shape
            if "function_call" in response:
                return {
                    "name": response["function_call"]["name"],
                    "arguments": response["function_call"]["arguments"],
                }

            # "function_call" nested inside choices[0] (chat-completions /
            # Groq OpenAI-compatible shape)
            if "choices" in response and response["choices"]:
                choice = response["choices"][0]
                if "function_call" in choice:
                    return {
                        "name": choice["function_call"]["name"],
                        "arguments": choice["function_call"]["arguments"],
                    }

            return None

        except Exception as e:
            logger.error(f"Error parsing tool call: {str(e)}")
            return None

    def parse_response(self, response: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
        """Parse a response from any provider.

        Extracts the text content from Anthropic-style content lists,
        chat-completions ``choices``, or a plain ``content`` string, plus
        any top-level ``metadata`` dict.

        Args:
            response: Response from the provider (as a plain dict)

        Returns:
            Tuple of (content, metadata); on error returns
            (error message, {"error": True})
        """
        try:
            content = ""
            metadata = {}

            # Anthropic format: list of content blocks; concatenate the
            # text blocks, ignore everything else.
            if "content" in response and isinstance(response["content"], list):
                for item in response["content"]:
                    if item["type"] == "text":
                        content += item["text"]

            # OpenAI chat-completions format: first choice only
            elif "choices" in response and response["choices"]:
                content = response["choices"][0]["message"]["content"]

            # Handle direct content
            elif isinstance(response.get("content"), str):
                content = response["content"]

            # Extract metadata if present
            if "metadata" in response:
                metadata = response["metadata"]

            return content, metadata

        except Exception as e:
            logger.error(f"Error parsing response: {str(e)}")
            return str(e), {"error": True}

    def format_for_provider(
        self, messages: List[Dict[str, Any]], provider: str
    ) -> List[Dict[str, Any]]:
        """Format messages for a specific provider.

        Multimodal content lists are passed through unchanged for
        anthropic/openai; for every other provider each message is
        collapsed to its first text part.

        Args:
            messages: List of messages to format
            provider: Provider to format for

        Returns:
            Formatted messages (the original list unchanged on error)
        """
        try:
            formatted = []

            for msg in messages:
                formatted_msg = {"role": msg["role"]}

                # Handle content formatting
                if isinstance(msg["content"], list):
                    # For providers that support multimodal
                    if provider in ["anthropic", "openai"]:
                        formatted_msg["content"] = msg["content"]
                    else:
                        # Extract text only for other providers
                        text_content = next(
                            (item["text"] for item in msg["content"] if item["type"] == "text"), ""
                        )
                        formatted_msg["content"] = text_content
                else:
                    formatted_msg["content"] = msg["content"]

                formatted.append(formatted_msg)

            return formatted

        except Exception as e:
            logger.error(f"Error formatting messages: {str(e)}")
            return messages  # Return original messages on error
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Prompts for the Omni agent."""

# System prompt for the Omni vision loop: instructs the model to pick one
# action per turn from the listed action set and reply with a JSON object
# ("Explanation" / "Action" / "Box ID" / "Value"). Box IDs refer to the
# numbered bounding boxes drawn on the annotated screenshot.
# NOTE: this is a runtime string sent to the LLM — edit with care, the
# downstream parser expects exactly these field names.
SYSTEM_PROMPT = """
You are using a macOS device.
You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.

You may be given some history plan and actions, this is the response from the previous loop.
You should carefully consider your plan base on the task, screenshot, and history actions.

Your available "Next Action" only include:
- type_text: types a string of text.
- left_click: move mouse to box id and left clicks.
- right_click: move mouse to box id and right clicks.
- double_click: move mouse to box id and double clicks.
- move_cursor: move mouse to box id.
- scroll_up: scrolls the screen up to view previous content.
- scroll_down: scrolls the screen down, when the desired button is not visible, or you need to see more content.
- hotkey: press a sequence of keys.
- wait: waits for 1 second for the device to load or respond.

Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task.

Output format:
{
    "Explanation": str, # describe what is in the current screen, taking into account the history, then describe your step-by-step thoughts on how to achieve the task, choose one action from available actions at a time.
    "Action": "action_type, action description" | "None" # one action at a time, describe it in short and precisely.
    "Box ID": n,
    "Value": "xxx" # only provide value field if the action is type, else don't include value key
}

One Example:
{
    "Explanation": "The current screen shows google result of amazon, in previous action I have searched amazon on google. Then I need to click on the first search results to go to amazon.com.",
    "Action": "left_click",
    "Box ID": 4
}

Another Example:
{
    "Explanation": "The current screen shows the front page of amazon. There is no previous action. Therefore I need to type "Apple watch" in the search bar.",
    "Action": "type_text",
    "Box ID": 2,
    "Value": "Apple watch"
}

Another Example:
{
    "Explanation": "I am starting a Spotlight search to find the Safari browser.",
    "Action": "hotkey",
    "Value": "command+space"
}

IMPORTANT NOTES:
1. You should only give a single action at a time.
2. The Box ID is the id of the element you should operate on, it is a number. Its background color corresponds to the color of the bounding box of the element.
3. You should give an analysis to the current screen, and reflect on what has been done by looking at the history, then describe your step-by-step thoughts on how to achieve the task.
4. Attach the next action prediction in the "Action" field.
5. For starting applications, always use the "hotkey" action with command+space for starting a Spotlight search.
6. When the task is completed, don't complete additional actions. You should say "Action": "None" in the json field.
7. The tasks involve buying multiple products or navigating through multiple pages. You should break it into subgoals and complete each subgoal one by one in the order of the instructions.
8. Avoid choosing the same action/elements multiple times in a row, if it happens, reflect to yourself, what may have gone wrong, and predict a different action.
9. Reflect whether the element is clickable or not, for example reflect if it is an hyperlink or a button or a normal text.
10. If you are prompted with login information page or captcha page, or you think it need user's permission to do the next action, you should say "Action": "None" in the json field.
"""

# Earlier, shorter variant of the prompt, kept for reference; unused.
# SYSTEM_PROMPT1 = """You are an AI assistant helping users interact with their computer.
# Analyze the screen information and respond with JSON containing:
# {
#     "Box ID": "Numeric ID of the relevant UI element",
#     "Action": "One of: left_click, right_click, double_click, move_cursor, drag_to, type_text, press_key, hotkey, scroll_down, scroll_up, wait",
#     "Value": "Text to type, key to press",
#     "Explanation": "Why this action was chosen"
# }

# Notes:
# - For starting applications, use the "hotkey" action with command+space for starting a Spotlight search.
# - Each UI element is highlighted with a colored bounding box, and its Box ID appears nearby in the same color for easy identification.
# """
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# """Omni tool manager implementation."""
|
|
2
|
+
|
|
3
|
+
# from typing import Dict, List, Type, Any
|
|
4
|
+
|
|
5
|
+
# from computer import Computer
|
|
6
|
+
# from ...core.tools import BaseToolManager, BashTool, EditTool
|
|
7
|
+
|
|
8
|
+
# class OmniToolManager(BaseToolManager):
|
|
9
|
+
# """Tool manager for multi-provider support."""
|
|
10
|
+
|
|
11
|
+
# def __init__(self, computer: Computer):
|
|
12
|
+
# """Initialize Omni tool manager.
|
|
13
|
+
|
|
14
|
+
# Args:
|
|
15
|
+
# computer: Computer instance for tools
|
|
16
|
+
# """
|
|
17
|
+
# super().__init__(computer)
|
|
18
|
+
|
|
19
|
+
# def get_anthropic_tools(self) -> List[Dict[str, Any]]:
|
|
20
|
+
# """Get tools formatted for Anthropic API.
|
|
21
|
+
|
|
22
|
+
# Returns:
|
|
23
|
+
# List of tool parameters in Anthropic format
|
|
24
|
+
# """
|
|
25
|
+
# tools: List[Dict[str, Any]] = []
|
|
26
|
+
|
|
27
|
+
# # Map base tools to Anthropic format
|
|
28
|
+
# for tool in self.tools.values():
|
|
29
|
+
# if isinstance(tool, BashTool):
|
|
30
|
+
# tools.append({
|
|
31
|
+
# "type": "bash_20241022",
|
|
32
|
+
# "name": tool.name
|
|
33
|
+
# })
|
|
34
|
+
# elif isinstance(tool, EditTool):
|
|
35
|
+
# tools.append({
|
|
36
|
+
# "type": "text_editor_20241022",
|
|
37
|
+
# "name": "str_replace_editor"
|
|
38
|
+
# })
|
|
39
|
+
|
|
40
|
+
# return tools
|
|
41
|
+
|
|
42
|
+
# def get_openai_tools(self) -> List[Dict]:
|
|
43
|
+
# """Get tools formatted for OpenAI API.
|
|
44
|
+
|
|
45
|
+
# Returns:
|
|
46
|
+
# List of tool parameters in OpenAI format
|
|
47
|
+
# """
|
|
48
|
+
# tools = []
|
|
49
|
+
|
|
50
|
+
# # Map base tools to OpenAI format
|
|
51
|
+
# for tool in self.tools.values():
|
|
52
|
+
# tools.append({
|
|
53
|
+
# "type": "function",
|
|
54
|
+
# "function": tool.get_schema()
|
|
55
|
+
# })
|
|
56
|
+
|
|
57
|
+
# return tools
|
|
58
|
+
|
|
59
|
+
# def get_groq_tools(self) -> List[Dict]:
|
|
60
|
+
# """Get tools formatted for Groq API.
|
|
61
|
+
|
|
62
|
+
# Returns:
|
|
63
|
+
# List of tool parameters in Groq format
|
|
64
|
+
# """
|
|
65
|
+
# tools = []
|
|
66
|
+
|
|
67
|
+
# # Map base tools to Groq format
|
|
68
|
+
# for tool in self.tools.values():
|
|
69
|
+
# tools.append({
|
|
70
|
+
# "type": "function",
|
|
71
|
+
# "function": tool.get_schema()
|
|
72
|
+
# })
|
|
73
|
+
|
|
74
|
+
# return tools
|
|
75
|
+
|
|
76
|
+
# def get_qwen_tools(self) -> List[Dict]:
|
|
77
|
+
# """Get tools formatted for Qwen API.
|
|
78
|
+
|
|
79
|
+
# Returns:
|
|
80
|
+
# List of tool parameters in Qwen format
|
|
81
|
+
# """
|
|
82
|
+
# tools = []
|
|
83
|
+
|
|
84
|
+
# # Map base tools to Qwen format
|
|
85
|
+
# for tool in self.tools.values():
|
|
86
|
+
# tools.append({
|
|
87
|
+
# "type": "function",
|
|
88
|
+
# "function": tool.get_schema()
|
|
89
|
+
# })
|
|
90
|
+
|
|
91
|
+
# return tools
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Omni provider tools - compatible with multiple LLM providers."""

from .bash import OmniBashTool
from .computer import OmniComputerTool
from .edit import OmniEditTool
from .manager import OmniToolManager

# Public API of the omni tools subpackage.
__all__ = [
    "OmniBashTool",
    "OmniComputerTool",
    "OmniEditTool",
    "OmniToolManager",
]
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Provider-agnostic implementation of the BashTool."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from computer.computer import Computer
|
|
7
|
+
|
|
8
|
+
from ....core.tools.bash import BaseBashTool
|
|
9
|
+
from ....core.tools import ToolResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OmniBashTool(BaseBashTool):
    """A provider-agnostic implementation of the bash tool.

    Exposes a provider-neutral parameter schema via :meth:`to_params` and
    delegates actual command execution to the base class's ``run_command``.
    """

    name = "bash"
    logger = logging.getLogger(__name__)

    def __init__(self, computer: Computer):
        """Initialize the BashTool.

        Args:
            computer: Computer instance, may be used for related operations
        """
        super().__init__(computer)

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to provider-agnostic parameters.

        Returns:
            Dictionary with tool parameters
        """
        schema: Dict[str, Any] = {
            "command": {"type": "string", "description": "The bash command to execute"},
            "restart": {
                "type": "boolean",
                "description": "Whether to restart the bash session",
            },
        }
        return {
            "name": self.name,
            "description": "A tool that allows the agent to run bash commands",
            "parameters": schema,
        }

    async def __call__(self, **kwargs) -> ToolResult:
        """Execute the bash tool with the provided arguments.

        Args:
            command: The bash command to execute
            restart: Whether to restart the bash session

        Returns:
            ToolResult with the command output (``error`` is set when the
            command exits non-zero)
        """
        command = kwargs.get("command")
        # NOTE(review): `restart` is accepted per the schema but not acted
        # on here — confirm whether the base class should handle it.
        restart = kwargs.get("restart", False)

        if not command:
            return ToolResult(error="Command is required")

        self.logger.info(f"Executing bash command: {command}")
        exit_code, stdout, stderr = await self.run_command(command)

        failure = None if exit_code == 0 else f"Command exited with code {exit_code}: {stderr}"
        return ToolResult(output=stdout, error=failure)
|