PyPI - cua-agent - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl - Mend

cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (57)

agent/__init__.py +3 -2
agent/core/__init__.py +1 -6
agent/core/{computer_agent.py → agent.py} +31 -76
agent/core/{loop.py → base.py} +68 -127
agent/core/factory.py +104 -0
agent/core/messages.py +279 -125
agent/core/provider_config.py +15 -0
agent/core/types.py +45 -0
agent/core/visualization.py +197 -0
agent/providers/anthropic/api/client.py +142 -1
agent/providers/anthropic/api_handler.py +140 -0
agent/providers/anthropic/callbacks/__init__.py +5 -0
agent/providers/anthropic/loop.py +207 -221
agent/providers/anthropic/response_handler.py +226 -0
agent/providers/anthropic/tools/bash.py +0 -97
agent/providers/anthropic/utils.py +368 -0
agent/providers/omni/__init__.py +1 -20
agent/providers/omni/api_handler.py +42 -0
agent/providers/omni/clients/anthropic.py +4 -0
agent/providers/omni/image_utils.py +0 -72
agent/providers/omni/loop.py +491 -607
agent/providers/omni/parser.py +58 -4
agent/providers/omni/tools/__init__.py +25 -7
agent/providers/omni/tools/base.py +29 -0
agent/providers/omni/tools/bash.py +43 -38
agent/providers/omni/tools/computer.py +144 -182
agent/providers/omni/tools/manager.py +25 -45
agent/providers/omni/types.py +1 -3
agent/providers/omni/utils.py +224 -145
agent/providers/openai/__init__.py +6 -0
agent/providers/openai/api_handler.py +453 -0
agent/providers/openai/loop.py +440 -0
agent/providers/openai/response_handler.py +205 -0
agent/providers/openai/tools/__init__.py +15 -0
agent/providers/openai/tools/base.py +79 -0
agent/providers/openai/tools/computer.py +319 -0
agent/providers/openai/tools/manager.py +106 -0
agent/providers/openai/types.py +36 -0
agent/providers/openai/utils.py +98 -0
cua_agent-0.1.18.dist-info/METADATA +165 -0
cua_agent-0.1.18.dist-info/RECORD +73 -0
agent/README.md +0 -63
agent/providers/anthropic/messages/manager.py +0 -112
agent/providers/omni/callbacks.py +0 -78
agent/providers/omni/clients/groq.py +0 -101
agent/providers/omni/experiment.py +0 -276
agent/providers/omni/messages.py +0 -171
agent/providers/omni/tool_manager.py +0 -91
agent/providers/omni/visualization.py +0 -130
agent/types/__init__.py +0 -23
agent/types/base.py +0 -41
agent/types/messages.py +0 -36
cua_agent-0.1.6.dist-info/METADATA +0 -120
cua_agent-0.1.6.dist-info/RECORD +0 -64
agent/{types → core}/tools.py +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/WHEEL +0 -0
{cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/entry_points.txt +0 -0

agent/providers/omni/experiment.py DELETED Viewed

@@ -1,276 +0,0 @@
-"""Experiment management for the Cua provider."""
-import os
-import logging
-import copy
-import base64
-from io import BytesIO
-from datetime import datetime
-from typing import Any, Dict, List, Optional
-from PIL import Image
-import json
-import time
-logger = logging.getLogger(__name__)
-class ExperimentManager:
-    """Manages experiment directories and logging for the agent."""
-    def __init__(
-        self,
-        base_dir: Optional[str] = None,
-        only_n_most_recent_images: Optional[int] = None,
-    ):
-        """Initialize the experiment manager.
-        Args:
-            base_dir: Base directory for saving experiment data
-            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
-        """
-        self.base_dir = base_dir
-        self.only_n_most_recent_images = only_n_most_recent_images
-        self.run_dir = None
-        self.current_turn_dir = None
-        self.turn_count = 0
-        self.screenshot_count = 0
-        # Track all screenshots for potential API request inclusion
-        self.screenshot_paths = []
-        # Set up experiment directories if base_dir is provided
-        if self.base_dir:
-            self.setup_experiment_dirs()
-    def setup_experiment_dirs(self) -> None:
-        """Setup the experiment directory structure."""
-        if not self.base_dir:
-            return
-        # Create base experiments directory if it doesn't exist
-        os.makedirs(self.base_dir, exist_ok=True)
-        # Use the base_dir directly as the run_dir
-        self.run_dir = self.base_dir
-        logger.info(f"Using directory for experiment: {self.run_dir}")
-        # Create first turn directory
-        self.create_turn_dir()
-    def create_turn_dir(self) -> None:
-        """Create a new directory for the current turn."""
-        if not self.run_dir:
-            return
-        self.turn_count += 1
-        self.current_turn_dir = os.path.join(self.run_dir, f"turn_{self.turn_count:03d}")
-        os.makedirs(self.current_turn_dir, exist_ok=True)
-        logger.info(f"Created turn directory: {self.current_turn_dir}")
-    def sanitize_log_data(self, data: Any) -> Any:
-        """Sanitize data for logging by removing large base64 strings.
-        Args:
-            data: Data to sanitize (dict, list, or primitive)
-        Returns:
-            Sanitized copy of the data
-        """
-        if isinstance(data, dict):
-            result = copy.deepcopy(data)
-            # Handle nested dictionaries and lists
-            for key, value in result.items():
-                # Process content arrays that contain image data
-                if key == "content" and isinstance(value, list):
-                    for i, item in enumerate(value):
-                        if isinstance(item, dict):
-                            # Handle Anthropic format
-                            if item.get("type") == "image" and isinstance(item.get("source"), dict):
-                                source = item["source"]
-                                if "data" in source and isinstance(source["data"], str):
-                                    # Replace base64 data with a placeholder and length info
-                                    data_len = len(source["data"])
-                                    source["data"] = f"[BASE64_IMAGE_DATA_LENGTH_{data_len}]"
-                            # Handle OpenAI format
-                            elif item.get("type") == "image_url" and isinstance(
-                                item.get("image_url"), dict
-                            ):
-                                url_dict = item["image_url"]
-                                if "url" in url_dict and isinstance(url_dict["url"], str):
-                                    url = url_dict["url"]
-                                    if url.startswith("data:"):
-                                        # Replace base64 data with placeholder
-                                        data_len = len(url)
-                                        url_dict["url"] = f"[BASE64_IMAGE_URL_LENGTH_{data_len}]"
-                # Handle other nested structures recursively
-                if isinstance(value, dict):
-                    result[key] = self.sanitize_log_data(value)
-                elif isinstance(value, list):
-                    result[key] = [self.sanitize_log_data(item) for item in value]
-            return result
-        elif isinstance(data, list):
-            return [self.sanitize_log_data(item) for item in data]
-        else:
-            return data
-    def save_debug_image(self, image_data: str, filename: str) -> None:
-        """Save a debug image to the experiment directory.
-        Args:
-            image_data: Base64 encoded image data
-            filename: Filename to save the image as
-        """
-        # Since we no longer want to use the images/ folder, we'll skip this functionality
-        return
-    def save_screenshot(self, img_base64: str, action_type: str = "") -> Optional[str]:
-        """Save a screenshot to the experiment directory.
-        Args:
-            img_base64: Base64 encoded screenshot
-            action_type: Type of action that triggered the screenshot
-        Returns:
-            Optional[str]: Path to the saved screenshot, or None if saving failed
-        """
-        if not self.current_turn_dir:
-            return None
-        try:
-            # Increment screenshot counter
-            self.screenshot_count += 1
-            # Create a descriptive filename
-            timestamp = int(time.time() * 1000)
-            action_suffix = f"_{action_type}" if action_type else ""
-            filename = f"screenshot_{self.screenshot_count:03d}{action_suffix}_{timestamp}.png"
-            # Save directly to the turn directory (no screenshots subdirectory)
-            filepath = os.path.join(self.current_turn_dir, filename)
-            # Save the screenshot
-            img_data = base64.b64decode(img_base64)
-            with open(filepath, "wb") as f:
-                f.write(img_data)
-            # Keep track of the file path for reference
-            self.screenshot_paths.append(filepath)
-            return filepath
-        except Exception as e:
-            logger.error(f"Error saving screenshot: {str(e)}")
-            return None
-    def should_save_debug_image(self) -> bool:
-        """Determine if debug images should be saved.
-        Returns:
-            Boolean indicating if debug images should be saved
-        """
-        # We no longer need to save debug images, so always return False
-        return False
-    def save_action_visualization(
-        self, img: Image.Image, action_name: str, details: str = ""
-    ) -> str:
-        """Save a visualization of an action.
-        Args:
-            img: Image to save
-            action_name: Name of the action
-            details: Additional details about the action
-        Returns:
-            Path to the saved image
-        """
-        if not self.current_turn_dir:
-            return ""
-        try:
-            # Create a descriptive filename
-            timestamp = int(time.time() * 1000)
-            details_suffix = f"_{details}" if details else ""
-            filename = f"vis_{action_name}{details_suffix}_{timestamp}.png"
-            # Save directly to the turn directory (no visualizations subdirectory)
-            filepath = os.path.join(self.current_turn_dir, filename)
-            # Save the image
-            img.save(filepath)
-            # Keep track of the file path for cleanup
-            self.screenshot_paths.append(filepath)
-            return filepath
-        except Exception as e:
-            logger.error(f"Error saving action visualization: {str(e)}")
-            return ""
-    def extract_and_save_images(self, data: Any, prefix: str) -> None:
-        """Extract and save images from response data.
-        Args:
-            data: Response data to extract images from
-            prefix: Prefix for saved image filenames
-        """
-        # Since we no longer want to save extracted images separately,
-        # we'll skip this functionality entirely
-        return
-    def log_api_call(
-        self,
-        call_type: str,
-        request: Any,
-        provider: str,
-        model: str,
-        response: Any = None,
-        error: Optional[Exception] = None,
-    ) -> None:
-        """Log API call details to file.
-        Args:
-            call_type: Type of API call (e.g., 'request', 'response', 'error')
-            request: The API request data
-            provider: The AI provider used
-            model: The AI model used
-            response: Optional API response data
-            error: Optional error information
-        """
-        if not self.current_turn_dir:
-            return
-        try:
-            # Create a unique filename with timestamp
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            filename = f"api_call_{timestamp}_{call_type}.json"
-            filepath = os.path.join(self.current_turn_dir, filename)
-            # Sanitize data to remove large base64 strings
-            sanitized_request = self.sanitize_log_data(request)
-            sanitized_response = self.sanitize_log_data(response) if response is not None else None
-            # Prepare log data
-            log_data = {
-                "timestamp": timestamp,
-                "provider": provider,
-                "model": model,
-                "type": call_type,
-                "request": sanitized_request,
-            }
-            if sanitized_response is not None:
-                log_data["response"] = sanitized_response
-            if error is not None:
-                log_data["error"] = str(error)
-            # Write to file
-            with open(filepath, "w") as f:
-                json.dump(log_data, f, indent=2, default=str)
-            logger.info(f"Logged API {call_type} to {filepath}")
-        except Exception as e:
-            logger.error(f"Error logging API call: {str(e)}")

agent/providers/omni/messages.py DELETED Viewed

@@ -1,171 +0,0 @@
-"""Omni message manager implementation."""
-import base64
-from typing import Any, Dict, List, Optional
-from io import BytesIO
-from PIL import Image
-from ...core.messages import BaseMessageManager, ImageRetentionConfig
-class OmniMessageManager(BaseMessageManager):
-    """Message manager for multi-provider support."""
-    def __init__(self, config: Optional[ImageRetentionConfig] = None):
-        """Initialize the message manager.
-        Args:
-            config: Optional configuration for image retention
-        """
-        super().__init__(config)
-        self.messages: List[Dict[str, Any]] = []
-        self.config = config
-    def add_user_message(self, content: str, images: Optional[List[bytes]] = None) -> None:
-        """Add a user message to the history.
-        Args:
-            content: Message content
-            images: Optional list of image data
-        """
-        # Add images if present
-        if images:
-            # Initialize with proper typing for mixed content
-            message_content: List[Dict[str, Any]] = [{"type": "text", "text": content}]
-            # Add each image
-            for img in images:
-                message_content.append(
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{base64.b64encode(img).decode()}"
-                        },
-                    }
-                )
-            message = {"role": "user", "content": message_content}
-        else:
-            # Simple text message
-            message = {"role": "user", "content": content}
-        self.messages.append(message)
-        # Apply retention policy
-        if self.config and self.config.num_images_to_keep:
-            self._apply_image_retention_policy()
-    def add_assistant_message(self, content: str) -> None:
-        """Add an assistant message to the history.
-        Args:
-            content: Message content
-        """
-        self.messages.append({"role": "assistant", "content": content})
-    def add_system_message(self, content: str) -> None:
-        """Add a system message to the history.
-        Args:
-            content: Message content
-        """
-        self.messages.append({"role": "system", "content": content})
-    def _apply_image_retention_policy(self) -> None:
-        """Apply image retention policy to message history."""
-        if not self.config or not self.config.num_images_to_keep:
-            return
-        # Count images from newest to oldest
-        image_count = 0
-        for message in reversed(self.messages):
-            if message["role"] != "user":
-                continue
-            # Handle multimodal messages
-            if isinstance(message["content"], list):
-                new_content = []
-                for item in message["content"]:
-                    if item["type"] == "text":
-                        new_content.append(item)
-                    elif item["type"] == "image_url":
-                        if image_count < self.config.num_images_to_keep:
-                            new_content.append(item)
-                            image_count += 1
-                message["content"] = new_content
-    def get_formatted_messages(self, provider: str) -> List[Dict[str, Any]]:
-        """Get messages formatted for specific provider.
-        Args:
-            provider: Provider name to format messages for
-        Returns:
-            List of formatted messages
-        """
-        # Set the provider for message formatting
-        self.set_provider(provider)
-        if provider == "anthropic":
-            return self._format_for_anthropic()
-        elif provider == "openai":
-            return self._format_for_openai()
-        elif provider == "groq":
-            return self._format_for_groq()
-        elif provider == "qwen":
-            return self._format_for_qwen()
-        else:
-            raise ValueError(f"Unsupported provider: {provider}")
-    def _format_for_anthropic(self) -> List[Dict[str, Any]]:
-        """Format messages for Anthropic API."""
-        formatted = []
-        for msg in self.messages:
-            formatted_msg = {"role": msg["role"]}
-            # Handle multimodal content
-            if isinstance(msg["content"], list):
-                formatted_msg["content"] = []
-                for item in msg["content"]:
-                    if item["type"] == "text":
-                        formatted_msg["content"].append({"type": "text", "text": item["text"]})
-                    elif item["type"] == "image_url":
-                        formatted_msg["content"].append(
-                            {
-                                "type": "image",
-                                "source": {
-                                    "type": "base64",
-                                    "media_type": "image/png",
-                                    "data": item["image_url"]["url"].split(",")[1],
-                                },
-                            }
-                        )
-            else:
-                formatted_msg["content"] = msg["content"]
-            formatted.append(formatted_msg)
-        return formatted
-    def _format_for_openai(self) -> List[Dict[str, Any]]:
-        """Format messages for OpenAI API."""
-        # OpenAI already uses the same format
-        return self.messages
-    def _format_for_groq(self) -> List[Dict[str, Any]]:
-        """Format messages for Groq API."""
-        # Groq uses OpenAI-compatible format
-        return self.messages
-    def _format_for_qwen(self) -> List[Dict[str, Any]]:
-        """Format messages for Qwen API."""
-        formatted = []
-        for msg in self.messages:
-            if isinstance(msg["content"], list):
-                # Convert multimodal content to text-only
-                text_content = next(
-                    (item["text"] for item in msg["content"] if item["type"] == "text"), ""
-                )
-                formatted.append({"role": msg["role"], "content": text_content})
-            else:
-                formatted.append(msg)
-        return formatted

agent/providers/omni/tool_manager.py DELETED Viewed

@@ -1,91 +0,0 @@
-# """Omni tool manager implementation."""
-# from typing import Dict, List, Type, Any
-# from computer import Computer
-# from ...core.tools import BaseToolManager, BashTool, EditTool
-# class OmniToolManager(BaseToolManager):
-#     """Tool manager for multi-provider support."""
-#     def __init__(self, computer: Computer):
-#         """Initialize Omni tool manager.
-#         Args:
-#             computer: Computer instance for tools
-#         """
-#         super().__init__(computer)
-#     def get_anthropic_tools(self) -> List[Dict[str, Any]]:
-#         """Get tools formatted for Anthropic API.
-#         Returns:
-#             List of tool parameters in Anthropic format
-#         """
-#         tools: List[Dict[str, Any]] = []
-#         # Map base tools to Anthropic format
-#         for tool in self.tools.values():
-#             if isinstance(tool, BashTool):
-#                 tools.append({
-#                     "type": "bash_20241022",
-#                     "name": tool.name
-#                 })
-#             elif isinstance(tool, EditTool):
-#                 tools.append({
-#                     "type": "text_editor_20241022",
-#                     "name": "str_replace_editor"
-#                 })
-#         return tools
-#     def get_openai_tools(self) -> List[Dict]:
-#         """Get tools formatted for OpenAI API.
-#         Returns:
-#             List of tool parameters in OpenAI format
-#         """
-#         tools = []
-#         # Map base tools to OpenAI format
-#         for tool in self.tools.values():
-#             tools.append({
-#                 "type": "function",
-#                 "function": tool.get_schema()
-#             })
-#         return tools
-#     def get_groq_tools(self) -> List[Dict]:
-#         """Get tools formatted for Groq API.
-#         Returns:
-#             List of tool parameters in Groq format
-#         """
-#         tools = []
-#         # Map base tools to Groq format
-#         for tool in self.tools.values():
-#             tools.append({
-#                 "type": "function",
-#                 "function": tool.get_schema()
-#             })
-#         return tools
-#     def get_qwen_tools(self) -> List[Dict]:
-#         """Get tools formatted for Qwen API.
-#         Returns:
-#             List of tool parameters in Qwen format
-#         """
-#         tools = []
-#         # Map base tools to Qwen format
-#         for tool in self.tools.values():
-#             tools.append({
-#                 "type": "function",
-#                 "function": tool.get_schema()
-#             })
-#         return tools

agent/providers/omni/visualization.py DELETED Viewed

@@ -1,130 +0,0 @@
-"""Visualization utilities for the Cua provider."""
-import base64
-import logging
-from io import BytesIO
-from typing import Tuple
-from PIL import Image, ImageDraw
-logger = logging.getLogger(__name__)
-def visualize_click(x: int, y: int, img_base64: str) -> Image.Image:
-    """Visualize a click action by drawing on the screenshot.
-    Args:
-        x: X coordinate of the click
-        y: Y coordinate of the click
-        img_base64: Base64 encoded image to draw on
-    Returns:
-        PIL Image with visualization
-    """
-    try:
-        # Decode the base64 image
-        img_data = base64.b64decode(img_base64)
-        img = Image.open(BytesIO(img_data))
-        # Create a drawing context
-        draw = ImageDraw.Draw(img)
-        # Draw concentric circles at the click position
-        small_radius = 10
-        large_radius = 30
-        # Draw filled inner circle
-        draw.ellipse(
-            [(x - small_radius, y - small_radius), (x + small_radius, y + small_radius)],
-            fill="red",
-        )
-        # Draw outlined outer circle
-        draw.ellipse(
-            [(x - large_radius, y - large_radius), (x + large_radius, y + large_radius)],
-            outline="red",
-            width=3,
-        )
-        return img
-    except Exception as e:
-        logger.error(f"Error visualizing click: {str(e)}")
-        # Return a blank image in case of error
-        return Image.new("RGB", (800, 600), color="white")
-def visualize_scroll(direction: str, clicks: int, img_base64: str) -> Image.Image:
-    """Visualize a scroll action by drawing arrows on the screenshot.
-    Args:
-        direction: 'up' or 'down'
-        clicks: Number of scroll clicks
-        img_base64: Base64 encoded image to draw on
-    Returns:
-        PIL Image with visualization
-    """
-    try:
-        # Decode the base64 image
-        img_data = base64.b64decode(img_base64)
-        img = Image.open(BytesIO(img_data))
-        # Get image dimensions
-        width, height = img.size
-        # Create a drawing context
-        draw = ImageDraw.Draw(img)
-        # Determine arrow direction and positions
-        center_x = width // 2
-        arrow_width = 100
-        if direction.lower() == "up":
-            # Draw up arrow in the middle of the screen
-            arrow_y = height // 2
-            # Arrow points
-            points = [
-                (center_x, arrow_y - 50),  # Top point
-                (center_x - arrow_width // 2, arrow_y + 50),  # Bottom left
-                (center_x + arrow_width // 2, arrow_y + 50),  # Bottom right
-            ]
-            color = "blue"
-        else:  # down
-            # Draw down arrow in the middle of the screen
-            arrow_y = height // 2
-            # Arrow points
-            points = [
-                (center_x, arrow_y + 50),  # Bottom point
-                (center_x - arrow_width // 2, arrow_y - 50),  # Top left
-                (center_x + arrow_width // 2, arrow_y - 50),  # Top right
-            ]
-            color = "green"
-        # Draw filled arrow
-        draw.polygon(points, fill=color)
-        # Add text showing number of clicks
-        text_y = arrow_y + 70 if direction.lower() == "down" else arrow_y - 70
-        draw.text((center_x - 40, text_y), f"{clicks} clicks", fill="black")
-        return img
-    except Exception as e:
-        logger.error(f"Error visualizing scroll: {str(e)}")
-        # Return a blank image in case of error
-        return Image.new("RGB", (800, 600), color="white")
-def calculate_element_center(box: Tuple[int, int, int, int]) -> Tuple[int, int]:
-    """Calculate the center coordinates of a bounding box.
-    Args:
-        box: Tuple of (left, top, right, bottom) coordinates
-    Returns:
-        Tuple of (center_x, center_y) coordinates
-    """
-    left, top, right, bottom = box
-    center_x = (left + right) // 2
-    center_y = (top + bottom) // 2
-    return center_x, center_y

agent/types/__init__.py DELETED Viewed

@@ -1,23 +0,0 @@
-"""Type definitions for the agent package."""
-from .base import HostConfig, TaskResult, Annotation
-from .messages import Message, Request, Response, StepMessage, DisengageMessage
-from .tools import ToolInvocation, ToolInvocationState, ClientAttachment, ToolResult
-__all__ = [
-    # Base types
-    "HostConfig",
-    "TaskResult",
-    "Annotation",
-    # Message types
-    "Message",
-    "Request",
-    "Response",
-    "StepMessage",
-    "DisengageMessage",
-    # Tool types
-    "ToolInvocation",
-    "ToolInvocationState",
-    "ClientAttachment",
-    "ToolResult",
-]

cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl

Potentially problematic release.

cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl