PyPI - cua-agent - Versions diffs - 0.1.0__py3-none-any.whl - Mend

cua-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (65)

agent/README.md +63 -0
agent/__init__.py +10 -0
agent/core/README.md +101 -0
agent/core/__init__.py +34 -0
agent/core/agent.py +284 -0
agent/core/base_agent.py +164 -0
agent/core/callbacks.py +147 -0
agent/core/computer_agent.py +69 -0
agent/core/experiment.py +222 -0
agent/core/factory.py +102 -0
agent/core/loop.py +244 -0
agent/core/messages.py +230 -0
agent/core/tools/__init__.py +21 -0
agent/core/tools/base.py +74 -0
agent/core/tools/bash.py +52 -0
agent/core/tools/collection.py +46 -0
agent/core/tools/computer.py +113 -0
agent/core/tools/edit.py +67 -0
agent/core/tools/manager.py +56 -0
agent/providers/__init__.py +4 -0
agent/providers/anthropic/__init__.py +6 -0
agent/providers/anthropic/api/client.py +222 -0
agent/providers/anthropic/api/logging.py +150 -0
agent/providers/anthropic/callbacks/manager.py +55 -0
agent/providers/anthropic/loop.py +521 -0
agent/providers/anthropic/messages/manager.py +110 -0
agent/providers/anthropic/prompts.py +20 -0
agent/providers/anthropic/tools/__init__.py +33 -0
agent/providers/anthropic/tools/base.py +88 -0
agent/providers/anthropic/tools/bash.py +163 -0
agent/providers/anthropic/tools/collection.py +34 -0
agent/providers/anthropic/tools/computer.py +550 -0
agent/providers/anthropic/tools/edit.py +326 -0
agent/providers/anthropic/tools/manager.py +54 -0
agent/providers/anthropic/tools/run.py +42 -0
agent/providers/anthropic/types.py +16 -0
agent/providers/omni/__init__.py +27 -0
agent/providers/omni/callbacks.py +78 -0
agent/providers/omni/clients/anthropic.py +99 -0
agent/providers/omni/clients/base.py +44 -0
agent/providers/omni/clients/groq.py +101 -0
agent/providers/omni/clients/openai.py +159 -0
agent/providers/omni/clients/utils.py +25 -0
agent/providers/omni/experiment.py +273 -0
agent/providers/omni/image_utils.py +106 -0
agent/providers/omni/loop.py +961 -0
agent/providers/omni/messages.py +168 -0
agent/providers/omni/parser.py +252 -0
agent/providers/omni/prompts.py +78 -0
agent/providers/omni/tool_manager.py +91 -0
agent/providers/omni/tools/__init__.py +13 -0
agent/providers/omni/tools/bash.py +69 -0
agent/providers/omni/tools/computer.py +216 -0
agent/providers/omni/tools/manager.py +83 -0
agent/providers/omni/types.py +30 -0
agent/providers/omni/utils.py +155 -0
agent/providers/omni/visualization.py +130 -0
agent/types/__init__.py +26 -0
agent/types/base.py +52 -0
agent/types/messages.py +36 -0
agent/types/tools.py +32 -0
cua_agent-0.1.0.dist-info/METADATA +44 -0
cua_agent-0.1.0.dist-info/RECORD +65 -0
cua_agent-0.1.0.dist-info/WHEEL +4 -0
cua_agent-0.1.0.dist-info/entry_points.txt +4 -0

agent/core/loop.py ADDED Viewed

@@ -0,0 +1,244 @@
+"""Base agent loop implementation."""
+import logging
+import asyncio
+import json
+import os
+from abc import ABC, abstractmethod
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from datetime import datetime
+import base64
+from computer import Computer
+from .experiment import ExperimentManager
+logger = logging.getLogger(__name__)
+class BaseLoop(ABC):
+    """Base class for agent loops that handle message processing and tool execution.
+    Subclasses provide the provider-specific pieces (``initialize_client``, ``run``
+    and ``_process_screen``). This class stores the shared configuration, forwards
+    trajectory bookkeeping to an optional ``ExperimentManager``, and offers a
+    retrying ``initialize`` plus a screenshot helper.
+    """
+    def __init__(
+        self,
+        computer: Computer,
+        model: str,
+        api_key: str,
+        max_tokens: int = 4096,
+        max_retries: int = 3,
+        retry_delay: float = 1.0,
+        base_dir: Optional[str] = "trajectories",
+        save_trajectory: bool = True,
+        only_n_most_recent_images: Optional[int] = 2,
+        **kwargs,
+    ):
+        """Initialize base agent loop.
+        Args:
+            computer: Computer instance to control
+            model: Model name to use
+            api_key: API key for provider
+            max_tokens: Maximum tokens to generate
+            max_retries: Maximum number of initialization attempts
+            retry_delay: Delay between retries in seconds
+            base_dir: Base directory for saving experiment data
+            save_trajectory: Whether to save trajectory data
+            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
+            **kwargs: Additional provider-specific arguments (stored, not interpreted here)
+        """
+        self.computer = computer
+        self.model = model
+        self.api_key = api_key
+        self.max_tokens = max_tokens
+        self.max_retries = max_retries
+        self.retry_delay = retry_delay
+        self.base_dir = base_dir
+        self.save_trajectory = save_trajectory
+        self.only_n_most_recent_images = only_n_most_recent_images
+        self._kwargs = kwargs
+        self.message_history = []
+        # An experiment manager exists only when trajectory saving is enabled AND a
+        # base directory is configured; every helper below treats None as "disabled".
+        if self.save_trajectory and self.base_dir:
+            self.experiment_manager = ExperimentManager(
+                base_dir=self.base_dir,
+                only_n_most_recent_images=only_n_most_recent_images,
+            )
+            # Track directories for convenience
+            self.run_dir = self.experiment_manager.run_dir
+            self.current_turn_dir = self.experiment_manager.current_turn_dir
+        else:
+            self.experiment_manager = None
+            self.run_dir = None
+            self.current_turn_dir = None
+        # Initialize basic tracking
+        self.turn_count = 0
+    def _setup_experiment_dirs(self) -> None:
+        """Set up the experiment directory structure (no-op without an experiment manager)."""
+        if self.experiment_manager:
+            self.experiment_manager.setup_experiment_dirs()
+            # Mirror the manager's directories on this instance.
+            self.run_dir = self.experiment_manager.run_dir
+            self.current_turn_dir = self.experiment_manager.current_turn_dir
+    def _create_turn_dir(self) -> None:
+        """Create a new directory for the current turn (no-op without an experiment manager)."""
+        if self.experiment_manager:
+            self.experiment_manager.create_turn_dir()
+            # Mirror the manager's turn state on this instance.
+            self.current_turn_dir = self.experiment_manager.current_turn_dir
+            self.turn_count = self.experiment_manager.turn_count
+    def _log_api_call(
+        self, call_type: str, request: Any, response: Any = None, error: Optional[Exception] = None
+    ) -> None:
+        """Log API call details through the experiment manager, if one is configured.
+        Args:
+            call_type: Type of API call (e.g., 'request', 'response', 'error')
+            request: The API request data
+            response: Optional API response data
+            error: Optional error information
+        """
+        if self.experiment_manager:
+            # Subclasses may define a ``provider`` attribute; fall back to "unknown"
+            # when it is missing or falsy.
+            provider = getattr(self, "provider", "unknown")
+            provider_str = str(provider) if provider else "unknown"
+            self.experiment_manager.log_api_call(
+                call_type=call_type,
+                request=request,
+                provider=provider_str,
+                model=self.model,
+                response=response,
+                error=error,
+            )
+    def _save_screenshot(self, img_base64: str, action_type: str = "") -> None:
+        """Save a screenshot through the experiment manager, if one is configured.
+        Args:
+            img_base64: Base64 encoded screenshot
+            action_type: Type of action that triggered the screenshot
+        """
+        if self.experiment_manager:
+            self.experiment_manager.save_screenshot(img_base64, action_type)
+    async def initialize(self) -> None:
+        """Initialize both the API client and computer interface with retries.
+        At least one attempt is always made, so a zero or negative ``max_retries``
+        cannot make this method return without initializing anything.
+        Raises:
+            RuntimeError: If every attempt fails; the last underlying exception is
+                attached as ``__cause__``.
+        """
+        attempts = max(1, self.max_retries)
+        for attempt in range(attempts):
+            try:
+                logger.info(
+                    f"Starting initialization (attempt {attempt + 1}/{attempts})..."
+                )
+                # Initialize API client
+                await self.initialize_client()
+                # Initialize computer
+                await self.computer.initialize()
+                logger.info("Initialization complete.")
+                return
+            except Exception as e:
+                if attempt < attempts - 1:
+                    logger.warning(
+                        f"Initialization failed (attempt {attempt + 1}/{attempts}): {str(e)}. Retrying..."
+                    )
+                    await asyncio.sleep(self.retry_delay)
+                else:
+                    logger.error(
+                        f"Initialization failed after {attempts} attempts: {str(e)}"
+                    )
+                    # Chain the cause so the original traceback is not lost.
+                    raise RuntimeError(f"Failed to initialize: {str(e)}") from e
+    async def _get_parsed_screen_som(self) -> Dict[str, Any]:
+        """Take a screenshot and wrap it in a parsed-screen dictionary.
+        Returns:
+            Dict with ``width``, ``height``, ``parsed_content_list`` (always empty
+            here), ``timestamp`` and ``screenshot_base64``. If anything fails, the
+            same keys are returned with default values plus an ``error`` entry;
+            the exception is logged, not raised.
+        """
+        try:
+            # Take screenshot
+            screenshot = await self.computer.screenshot()
+            # Defaults used when the screenshot object does not report its size.
+            width, height = 1024, 768
+            base64_image = ""
+            # Handle different types of screenshot returns
+            if isinstance(screenshot, bytes):
+                # Raw bytes screenshot
+                base64_image = base64.b64encode(screenshot).decode("utf-8")
+            elif hasattr(screenshot, "base64_image"):
+                # Object-style screenshot with attributes
+                base64_image = screenshot.base64_image
+                if hasattr(screenshot, "width") and hasattr(screenshot, "height"):
+                    width = screenshot.width
+                    height = screenshot.height
+            # Create parsed screen data
+            parsed_screen = {
+                "width": width,
+                "height": height,
+                "parsed_content_list": [],
+                "timestamp": datetime.now().isoformat(),
+                "screenshot_base64": base64_image,
+            }
+            # Saving is best-effort: a failure is logged and the parsed screen is
+            # still returned.
+            if self.save_trajectory and self.experiment_manager:
+                try:
+                    img_data = base64_image
+                    # Drop a "data:...;base64," style prefix if one is present.
+                    if "," in img_data:
+                        img_data = img_data.split(",")[1]
+                    self._save_screenshot(img_data, action_type="state")
+                except Exception as e:
+                    logger.error(f"Error saving screenshot: {str(e)}")
+            return parsed_screen
+        except Exception as e:
+            logger.error(f"Error taking screenshot: {str(e)}")
+            return {
+                "width": 1024,
+                "height": 768,
+                "parsed_content_list": [],
+                "timestamp": datetime.now().isoformat(),
+                "error": f"Error taking screenshot: {str(e)}",
+                "screenshot_base64": "",
+            }
+    @abstractmethod
+    async def initialize_client(self) -> None:
+        """Initialize the API client and any provider-specific components."""
+        raise NotImplementedError
+    @abstractmethod
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+        """Run the agent loop with provided messages.
+        Args:
+            messages: List of message objects
+        Yields:
+            Dict containing response data
+        """
+        raise NotImplementedError
+    @abstractmethod
+    async def _process_screen(
+        self, parsed_screen: Dict[str, Any], messages: List[Dict[str, Any]]
+    ) -> None:
+        """Process screen information and add to messages.
+        Args:
+            parsed_screen: Dictionary containing parsed screen info
+            messages: List of messages to update
+        """
+        raise NotImplementedError

agent/core/messages.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""Message handling utilities for agent."""
+import base64
+from datetime import datetime
+from io import BytesIO
+import logging
+from typing import Any, Dict, List, Optional, Union
+from PIL import Image
+from dataclasses import dataclass
+logger = logging.getLogger(__name__)
+@dataclass
+class ImageRetentionConfig:
+    """Settings that control how many images are kept in prepared messages."""
+    # Number of most recent images to keep; None turns image filtering off.
+    num_images_to_keep: Optional[int] = None
+    # Images are removed in multiples of this value.
+    min_removal_threshold: int = 1
+    # Whether cache-control markers are injected into messages.
+    enable_caching: bool = True
+    def should_retain_images(self) -> bool:
+        """Return True when a positive image limit is configured."""
+        limit = self.num_images_to_keep
+        if limit is None:
+            return False
+        return limit > 0
+class BaseMessageManager:
+    """Base class for message preparation and management.
+    Messages are dicts with ``role`` and ``content`` keys, where ``content`` is
+    either a string or a list of content-block dicts. Every preparation step
+    mutates the given messages in place.
+    """
+    def __init__(self, image_retention_config: Optional[ImageRetentionConfig] = None):
+        """Initialize the message manager.
+        Args:
+            image_retention_config: Configuration for image retention; a default
+                ``ImageRetentionConfig()`` is used when omitted.
+        Raises:
+            ValueError: If ``min_removal_threshold`` is less than 1.
+        """
+        self.image_retention_config = image_retention_config or ImageRetentionConfig()
+        if self.image_retention_config.min_removal_threshold < 1:
+            raise ValueError("min_removal_threshold must be at least 1")
+    def prepare_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Prepare messages by applying image retention and caching as configured.
+        Args:
+            messages: List of messages to prepare (modified in place)
+        Returns:
+            The same ``messages`` list that was passed in
+        """
+        if self.image_retention_config.should_retain_images():
+            self._filter_images(messages)
+        if self.image_retention_config.enable_caching:
+            self._inject_caching(messages)
+        return messages
+    def _filter_images(self, messages: List[Dict[str, Any]]) -> None:
+        """Filter messages to retain only the specified number of most recent images.
+        Only images nested inside ``tool_result`` blocks are counted and removed.
+        Messages or tool results whose content is missing or not a list are skipped.
+        Args:
+            messages: Messages to filter (modified in place)
+        """
+        def is_image(block: Any) -> bool:
+            return isinstance(block, dict) and block.get("type") == "image"
+        # Collect every tool_result block whose own content is a list.
+        tool_results = []
+        for message in messages:
+            blocks = message.get("content")
+            if not isinstance(blocks, list):
+                continue
+            for item in blocks:
+                if (
+                    isinstance(item, dict)
+                    and item.get("type") == "tool_result"
+                    and isinstance(item.get("content"), list)
+                ):
+                    tool_results.append(item)
+        total_images = sum(
+            1 for result in tool_results for block in result["content"] if is_image(block)
+        )
+        # Calculate how many images to remove, rounded down to a multiple of the
+        # removal threshold.
+        images_to_remove = total_images - (self.image_retention_config.num_images_to_keep or 0)
+        images_to_remove -= images_to_remove % self.image_retention_config.min_removal_threshold
+        if images_to_remove <= 0:
+            return
+        # Walk in message order so the oldest images are dropped first.
+        for result in tool_results:
+            kept = []
+            for block in result["content"]:
+                if images_to_remove > 0 and is_image(block):
+                    images_to_remove -= 1
+                    continue
+                kept.append(block)
+            result["content"] = kept
+    def _inject_caching(self, messages: List[Dict[str, Any]]) -> None:
+        """Inject caching control for recent message turns.
+        The last content block of each of the three most recent user messages
+        (those with list content) is marked ``{"type": "ephemeral"}``. The marker
+        is removed from the next older one, and scanning stops there. User
+        messages whose content list is empty, or whose last block is not a dict,
+        have nothing to mark and are skipped without consuming a turn.
+        Args:
+            messages: Messages to inject caching into (modified in place)
+        """
+        # Default to caching last 3 turns
+        turns_to_cache = 3
+        for message in reversed(messages):
+            content = message.get("content")
+            if message.get("role") != "user" or not isinstance(content, list):
+                continue
+            if not content or not isinstance(content[-1], dict):
+                continue
+            if turns_to_cache:
+                turns_to_cache -= 1
+                content[-1]["cache_control"] = {"type": "ephemeral"}
+            else:
+                content[-1].pop("cache_control", None)
+                break
+def create_user_message(text: str) -> Dict[str, str]:
+    """Build a plain-text message attributed to the user.
+    Args:
+        text: The message text
+    Returns:
+        Dictionary with ``role`` set to "user" and ``content`` set to ``text``
+    """
+    message = {"role": "user"}
+    message["content"] = text
+    return message
+def create_assistant_message(text: str) -> Dict[str, str]:
+    """Build a plain-text message attributed to the assistant.
+    Args:
+        text: The message text
+    Returns:
+        Dictionary with ``role`` set to "assistant" and ``content`` set to ``text``
+    """
+    return dict(role="assistant", content=text)
+def create_system_message(text: str) -> Dict[str, str]:
+    """Build a plain-text system message.
+    Args:
+        text: The message text
+    Returns:
+        Dictionary with ``role`` set to "system" and ``content`` set to ``text``
+    """
+    role = "system"
+    return {"role": role, "content": text}
+def create_image_message(
+    image_base64: Optional[str] = None,
+    image_path: Optional[str] = None,
+    image_obj: Optional[Image.Image] = None,
+) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
+    """Create a user message containing a single image as a data URL.
+    Sources are tried in this order: ``image_base64`` is used as-is when given,
+    otherwise the file at ``image_path`` is read, otherwise ``image_obj`` is
+    encoded as PNG.
+    Args:
+        image_base64: Base64 encoded image (labelled ``image/png``)
+        image_path: Path to image file; the media type is guessed from the file
+            name and falls back to ``image/png`` when it cannot be determined
+        image_obj: PIL Image object
+    Returns:
+        Message dictionary with content list
+    Raises:
+        ValueError: If no image source is provided
+        OSError: If ``image_path`` cannot be read
+    """
+    # Function-scope import keeps this block self-contained.
+    import mimetypes
+    if not any([image_base64, image_path, image_obj is not None]):
+        raise ValueError("Must provide one of image_base64, image_path, or image_obj")
+    media_type = "image/png"
+    # Convert to base64 if needed
+    if image_path and not image_base64:
+        with open(image_path, "rb") as f:
+            image_base64 = base64.b64encode(f.read()).decode("utf-8")
+        # Label the data with the file's real media type so a JPEG/GIF/WebP file
+        # is not announced as PNG.
+        guessed, _ = mimetypes.guess_type(image_path)
+        if guessed and guessed.startswith("image/"):
+            media_type = guessed
+    elif image_obj is not None and not image_base64:
+        buffer = BytesIO()
+        image_obj.save(buffer, format="PNG")
+        image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+    return {
+        "role": "user",
+        "content": [
+            {"type": "image", "image_url": {"url": f"data:{media_type};base64,{image_base64}"}}
+        ],
+    }
+def create_screen_message(
+    parsed_screen: Dict[str, Any],
+    include_raw: bool = False,
+) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
+    """Create a user message with screen information.
+    Args:
+        parsed_screen: Dictionary containing parsed screen info; must provide
+            ``width`` and ``height`` and may provide ``screenshot_base64``
+        include_raw: Whether to include raw screenshot base64
+    Returns:
+        With ``include_raw`` and a non-empty ``screenshot_base64``, a message whose
+        content is a list holding the image block followed by a text block with the
+        dimensions. Otherwise a text-only message with the dimensions.
+    Raises:
+        KeyError: If ``width`` or ``height`` is missing from ``parsed_screen``
+    """
+    dimensions = f"Screen dimensions: {parsed_screen['width']}x{parsed_screen['height']}"
+    # An empty screenshot string would produce an image block with an empty data
+    # URL, so it is treated the same as a missing screenshot.
+    screenshot = parsed_screen.get("screenshot_base64") if include_raw else None
+    if not screenshot:
+        # Create text-only message with screen info
+        return {"role": "user", "content": dimensions}
+    # Create content list with both image and text
+    return {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image_url": {"url": f"data:image/png;base64,{screenshot}"},
+            },
+            {"type": "text", "text": dimensions},
+        ],
+    }

agent/core/tools/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Core tools package."""
+from .base import BaseTool, ToolResult, ToolError, ToolFailure, CLIResult
+from .bash import BaseBashTool
+from .collection import ToolCollection
+from .computer import BaseComputerTool
+from .edit import BaseEditTool
+from .manager import BaseToolManager
+# Public API of the core tools package: each name imported above is re-exported.
+__all__ = [
+    "BaseTool",
+    "ToolResult",
+    "ToolError",
+    "ToolFailure",
+    "CLIResult",
+    "BaseBashTool",
+    "BaseComputerTool",
+    "BaseEditTool",
+    "ToolCollection",
+    "BaseToolManager",
+]

agent/core/tools/base.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""Abstract base classes for tools that can be used with any provider."""
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass, fields, replace
+from typing import Any, Dict
+class BaseTool(metaclass=ABCMeta):
+    """Provider-agnostic tool interface.
+    Concrete tools must define ``name`` and implement both abstract methods below.
+    """
+    name: str
+    @abstractmethod
+    def to_params(self) -> Dict[str, Any]:
+        """Describe this tool as API parameters for the LLM provider.
+        Returns:
+            Dictionary with tool parameters specific to the LLM provider
+        """
+        raise NotImplementedError()
+    @abstractmethod
+    async def __call__(self, **kwargs) -> Any:
+        """Run the tool with the given keyword arguments."""
+@dataclass(kw_only=True, frozen=True)
+class ToolResult:
+    """Immutable, keyword-only record of what a tool execution produced.
+    A result is falsy when every field is empty. Two results can be added:
+    ``output``, ``error`` and ``system`` are concatenated, ``base64_image`` may be
+    set on one side only, and ``content`` takes the first non-empty value.
+    """
+    output: str | None = None
+    error: str | None = None
+    base64_image: str | None = None
+    system: str | None = None
+    content: list[dict] | None = None
+    def __bool__(self):
+        for spec in fields(self):
+            if getattr(self, spec.name):
+                return True
+        return False
+    def __add__(self, other: "ToolResult"):
+        def merge(mine: str | None, theirs: str | None, concatenate: bool = True):
+            # With at most one side set there is nothing to combine.
+            if not (mine and theirs):
+                return mine or theirs
+            if not concatenate:
+                raise ValueError("Cannot combine tool results")
+            return mine + theirs
+        merged_output = merge(self.output, other.output)
+        merged_error = merge(self.error, other.error)
+        # Two images cannot be merged, so this raises when both sides carry one.
+        merged_image = merge(self.base64_image, other.base64_image, False)
+        merged_system = merge(self.system, other.system)
+        return ToolResult(
+            output=merged_output,
+            error=merged_error,
+            base64_image=merged_image,
+            system=merged_system,
+            content=self.content or other.content,
+        )
+    def replace(self, **kwargs):
+        """Return a copy of this result with the given fields replaced."""
+        updated = replace(self, **kwargs)
+        return updated
+class CLIResult(ToolResult):
+    """A ToolResult that can be rendered as a CLI output.
+    Adds no fields or methods of its own; presumably the distinct type lets
+    callers recognise CLI-style results (confirm against the call sites).
+    """
+class ToolFailure(ToolResult):
+    """A ToolResult that represents a failure.
+    Adds no fields or methods of its own; the failure details are carried in the
+    fields inherited from ToolResult.
+    """
+class ToolError(Exception):
+    """Raised when a tool encounters an error.
+    Attributes:
+        message: Description of the failure, as passed to the constructor.
+    """
+    def __init__(self, message):
+        # Initialize the Exception base explicitly so ``args`` and ``str()`` are
+        # populated through the normal constructor path.
+        super().__init__(message)
+        self.message = message

agent/core/tools/bash.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Abstract base bash/shell tool implementation."""
+import asyncio
+import logging
+from abc import abstractmethod
+from typing import Any, Dict, Tuple
+from computer.computer import Computer
+from .base import BaseTool, ToolResult
+class BaseBashTool(BaseTool):
+    """Base class for bash/shell command execution tools across different providers.
+    Concrete subclasses implement ``__call__``; ``run_command`` is the shared helper
+    that spawns the shell and collects its output.
+    """
+    name = "bash"
+    logger = logging.getLogger(__name__)
+    computer: Computer
+    def __init__(self, computer: Computer):
+        """Initialize the BashTool.
+        Args:
+            computer: Computer instance, may be used for related operations
+                (``run_command`` itself does not use it)
+        """
+        self.computer = computer
+    async def run_command(self, command: str) -> Tuple[int, str, str]:
+        """Run a shell command and return exit code, stdout, and stderr.
+        Output is decoded with the default encoding; bytes that cannot be decoded
+        are replaced, so the real exit code and output are still returned.
+        Args:
+            command: Shell command to execute
+        Returns:
+            Tuple containing (exit_code, stdout, stderr). If the command could not
+            be run at all, the error is logged and ``(1, "", <error text>)`` is
+            returned instead of raising.
+        """
+        try:
+            # NOTE(review): the command string is handed to the shell verbatim, so
+            # callers must not build it from untrusted input.
+            process = await asyncio.create_subprocess_shell(
+                command,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            stdout, stderr = await process.communicate()
+            # A strict decode would raise on non-UTF-8 output and fall into the
+            # handler below, discarding the command's actual result.
+            return (
+                process.returncode or 0,
+                stdout.decode(errors="replace"),
+                stderr.decode(errors="replace"),
+            )
+        except Exception as e:
+            self.logger.error(f"Error running command: {str(e)}")
+            return 1, "", str(e)
+    @abstractmethod
+    async def __call__(self, **kwargs) -> ToolResult:
+        """Execute the tool with the provided arguments."""
+        raise NotImplementedError

agent/core/tools/collection.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""Collection classes for managing multiple tools."""
+from typing import Any, Dict, List, Type
+from .base import (
+    BaseTool,
+    ToolError,
+    ToolFailure,
+    ToolResult,
+)
+class ToolCollection:
+    """Holds a fixed set of tools and dispatches calls to them by name."""
+    def __init__(self, *tools: BaseTool):
+        self.tools = tools
+        # Name-based lookup; a later tool with the same name replaces an earlier one.
+        self.tool_map = dict((tool.name, tool) for tool in tools)
+    def to_params(self) -> List[Dict[str, Any]]:
+        """Collect the provider-specific parameters of every tool.
+        Returns:
+            List of dictionaries with tool parameters, in registration order
+        """
+        params = []
+        for tool in self.tools:
+            params.append(tool.to_params())
+        return params
+    async def run(self, *, name: str, tool_input: Dict[str, Any]) -> ToolResult:
+        """Look up a tool by name and execute it.
+        Failures are reported as ``ToolFailure`` results rather than raised.
+        Args:
+            name: Name of the tool to run
+            tool_input: Input parameters for the tool, passed as keyword arguments
+        Returns:
+            Result of the tool execution, or a ``ToolFailure`` when the name is
+            unknown or the tool raises
+        """
+        selected = self.tool_map.get(name)
+        if not selected:
+            return ToolFailure(error=f"Tool {name} is invalid")
+        try:
+            outcome = await selected(**tool_input)
+        except ToolError as e:
+            return ToolFailure(error=e.message)
+        except Exception as e:
+            return ToolFailure(error=f"Unexpected error in tool {name}: {str(e)}")
+        return outcome