cua-agent 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

@@ -0,0 +1,79 @@
1
+ """OpenAI-specific tool base classes."""
2
+
3
+ from abc import ABCMeta, abstractmethod
4
+ from dataclasses import dataclass, fields, replace
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from ....core.tools.base import BaseTool
8
+
9
+
10
class BaseOpenAITool(BaseTool, metaclass=ABCMeta):
    """Abstract base class for tools exposed through the OpenAI API.

    Concrete subclasses must implement the async ``__call__`` entry point
    and ``to_params`` for serializing the tool definition.
    """

    def __init__(self):
        """Initialize the base OpenAI tool.

        Currently a no-op; kept so subclasses have a stable
        ``super().__init__()`` chain as this class grows.
        """

    @abstractmethod
    async def __call__(self, **kwargs) -> Any:
        """Execute the tool with the given keyword arguments."""
        ...

    @abstractmethod
    def to_params(self) -> Dict[str, Any]:
        """Convert this tool to OpenAI-specific API parameters.

        Returns:
            Dictionary with tool parameters for the OpenAI API.
        """
        raise NotImplementedError
31
+
32
+
33
+ @dataclass(kw_only=True, frozen=True)
34
+ class ToolResult:
35
+ """Represents the result of a tool execution."""
36
+
37
+ output: str | None = None
38
+ error: str | None = None
39
+ base64_image: str | None = None
40
+ system: str | None = None
41
+ content: list[dict] | None = None
42
+
43
+ def __bool__(self):
44
+ return any(getattr(self, field.name) for field in fields(self))
45
+
46
+ def __add__(self, other: "ToolResult"):
47
+ def combine_fields(field: str | None, other_field: str | None, concatenate: bool = True):
48
+ if field and other_field:
49
+ if concatenate:
50
+ return field + other_field
51
+ raise ValueError("Cannot combine tool results")
52
+ return field or other_field
53
+
54
+ return ToolResult(
55
+ output=combine_fields(self.output, other.output),
56
+ error=combine_fields(self.error, other.error),
57
+ base64_image=combine_fields(self.base64_image, other.base64_image, False),
58
+ system=combine_fields(self.system, other.system),
59
+ content=self.content or other.content, # Use first non-None content
60
+ )
61
+
62
+ def replace(self, **kwargs):
63
+ """Returns a new ToolResult with the given fields replaced."""
64
+ return replace(self, **kwargs)
65
+
66
+
67
class CLIResult(ToolResult):
    """A ToolResult that can be rendered as a CLI output.

    Marker subclass: adds no fields or behavior of its own.
    """
69
+
70
+
71
class ToolFailure(ToolResult):
    """A ToolResult that represents a failure.

    Marker subclass: adds no fields or behavior of its own.
    """
73
+
74
+
75
class ToolError(Exception):
    """Raised when a tool encounters an error during execution."""

    def __init__(self, message):
        """Store the error message.

        Args:
            message: Human-readable description of the failure.

        Calling ``super().__init__`` makes ``str(exc)`` return the message.
        The original only set ``self.message``, so every ``str(e)`` in this
        package's logging/re-wrapping code produced an empty string.
        """
        super().__init__(message)
        self.message = message
@@ -0,0 +1,319 @@
1
+ """Computer tool for OpenAI."""
2
+
3
+ import asyncio
4
+ import base64
5
+ import logging
6
+ from typing import Literal, Any, Dict, Optional, List, Union
7
+
8
+ from computer.computer import Computer
9
+
10
+ from .base import BaseOpenAITool, ToolError, ToolResult
11
+ from ....core.tools.computer import BaseComputerTool
12
+
13
# Typing cadence used when simulating keyboard input.
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50

# Key mapping for special keys: translates OpenAI key names into the
# names the computer interface expects (macOS-style, e.g. "return",
# "command", "forwarddelete").
KEY_MAPPING = {
    "enter": "return",
    "backspace": "delete",
    "delete": "forwarddelete",
    "escape": "esc",
    "pageup": "page_up",
    "pagedown": "page_down",
    "arrowup": "up",
    "arrowdown": "down",
    "arrowleft": "left",
    "arrowright": "right",
    "home": "home",
    "end": "end",
    "tab": "tab",
    "space": "space",
    "shift": "shift",
    "control": "control",
    "alt": "alt",
    "meta": "command",
}

# NOTE(review): this literal lists Anthropic-style action names
# ("left_click", "key", ...) but ComputerTool.__call__ below dispatches
# OpenAI names ("click", "keypress", "wait", ...). Confirm which set is
# authoritative; the literal appears unused by the dispatcher.
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "right_click",
    "double_click",
    "screenshot",
    "scroll",
]
48
+
49
+
50
class ComputerTool(BaseComputerTool, BaseOpenAITool):
    """
    A tool that allows the agent to interact with the screen, keyboard, and
    mouse of the current computer.

    Screen dimensions are unknown until ``initialize_dimensions()`` has run;
    ``__call__`` initializes them lazily on first use.
    """

    name: Literal["computer"] = "computer"
    api_type: Literal["computer_use_preview"] = "computer_use_preview"
    width: Optional[int] = None
    height: Optional[int] = None
    display_num: Optional[int] = None
    computer: Computer  # The CUA Computer instance
    logger = logging.getLogger(__name__)

    _screenshot_delay = 1.0  # macOS is generally faster than X11
    _scaling_enabled = True

    def __init__(self, computer: Computer):
        """Initialize the computer tool.

        Args:
            computer: Computer instance used for all screen/keyboard/mouse
                operations.
        """
        self.computer = computer
        self.logger = logging.getLogger(__name__)

        # Initialize the base computer tool first, then the OpenAI tool.
        BaseComputerTool.__init__(self, computer)
        BaseOpenAITool.__init__(self)

        # Dimensions are populated later from the computer interface by
        # initialize_dimensions(). (The original assigned width/height to
        # None twice; once is sufficient.)
        self.width = None
        self.height = None
        self.display_num = None

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with tool parameters for the OpenAI API.

        Raises:
            RuntimeError: If screen dimensions have not been initialized.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "type": self.api_type,
            "display_width": self.width,
            "display_height": self.height,
            "display_number": self.display_num,
        }

    async def initialize_dimensions(self):
        """Initialize screen dimensions from the computer interface.

        Falls back to 1024x768 when the interface cannot report a size.
        The assert below also routes non-int values into the fallback
        path, since AssertionError is caught by the except clause.
        """
        try:
            display_size = await self.computer.interface.get_screen_size()
            self.width = display_size["width"]
            self.height = display_size["height"]
            assert isinstance(self.width, int) and isinstance(self.height, int)
            self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
        except Exception as e:
            # Fall back to defaults if we can't get accurate dimensions
            self.width = 1024
            self.height = 768
            self.logger.warning(
                f"Failed to get screen dimensions, using defaults: {self.width}x{self.height}. Error: {e}"
            )

    async def __call__(
        self,
        *,
        type: str,  # OpenAI uses 'type' instead of 'action'; shadows the builtin by design
        text: Optional[str] = None,
        **kwargs,
    ):
        """Dispatch an OpenAI computer action to the matching handler.

        Args:
            type: Action name ("type", "click", "keypress", "mouse_move",
                "scroll", "screenshot", "wait").
            text: Text payload for "type"/"keypress" actions.
            **kwargs: Action-specific parameters (x, y, button, keys,
                coordinates, scroll_x, scroll_y, duration).

        Returns:
            ToolResult from the selected handler.

        Raises:
            ToolError: If the action is unknown, required parameters are
                missing, or the underlying handler fails.
        """
        try:
            # Lazily initialize dimensions on first use.
            if self.width is None or self.height is None:
                await self.initialize_dimensions()
                if self.width is None or self.height is None:
                    raise ToolError("Failed to initialize screen dimensions")

            if type == "type":
                if text is None:
                    raise ToolError("text is required for type action")
                return await self.handle_typing(text)
            elif type == "click":
                button = kwargs.get("button")
                if button is None:
                    raise ToolError("button is required for click action")
                # Validate coordinates explicitly instead of letting a bare
                # KeyError escape and be wrapped with an opaque message.
                x = kwargs.get("x")
                y = kwargs.get("y")
                if x is None or y is None:
                    raise ToolError("x and y coordinates are required for click action")
                return await self.handle_click(button, x, y)
            elif type == "keypress":
                # Accept either 'text' (e.g. "ctrl+c") or a 'keys' list.
                if text is None:
                    if "keys" in kwargs and isinstance(kwargs["keys"], list):
                        # Pass the keys list directly instead of joining and then splitting
                        return await self.handle_key(kwargs["keys"])
                    else:
                        raise ToolError("Either 'text' or 'keys' is required for keypress action")
                return await self.handle_key(text)
            elif type == "mouse_move":
                if "coordinates" not in kwargs:
                    raise ToolError("coordinates is required for mouse_move action")
                return await self.handle_mouse_move(
                    kwargs["coordinates"][0], kwargs["coordinates"][1]
                )
            elif type == "scroll":
                x = kwargs.get("x")
                y = kwargs.get("y")
                if x is None or y is None:
                    raise ToolError("x and y coordinates are required for scroll action")
                scroll_x = kwargs.get("scroll_x", 0)
                scroll_y = kwargs.get("scroll_y", 0)
                return await self.handle_scroll(x, y, scroll_x, scroll_y)
            elif type == "screenshot":
                return await self.screenshot()
            elif type == "wait":
                duration = kwargs.get("duration", 1.0)
                await asyncio.sleep(duration)
                return await self.screenshot()
            else:
                raise ToolError(f"Unsupported action: {type}")

        except ToolError:
            # Re-raise our own errors as-is; the original wrapped them in a
            # second ToolError ("Failed to execute ..."), obscuring the
            # specific validation message.
            raise
        except Exception as e:
            self.logger.error(f"Error in ComputerTool.__call__: {str(e)}")
            raise ToolError(f"Failed to execute {type}: {str(e)}")

    async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
        """Perform a mouse click and return a post-action screenshot.

        Args:
            button: One of "left", "right", or "double".
            x: Screen x coordinate.
            y: Screen y coordinate.

        Raises:
            ToolError: For an unsupported button or an interface failure.
        """
        # Previously an unknown button fell through silently (no click was
        # performed) yet the tool still reported "Performed ... click".
        if button not in ("left", "right", "double"):
            raise ToolError(f"Unsupported click button: {button}")
        try:
            if button == "left":
                await self.computer.interface.left_click(x, y)
            elif button == "right":
                await self.computer.interface.right_click(x, y)
            elif button == "double":
                await self.computer.interface.double_click(x, y)

            # Wait for UI to update
            await asyncio.sleep(0.5)

            # Take screenshot after action
            screenshot = await self.computer.interface.screenshot()
            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")

            return ToolResult(
                output=f"Performed {button} click at ({x}, {y})",
                base64_image=base64_screenshot,
            )
        except Exception as e:
            self.logger.error(f"Error in handle_click: {str(e)}")
            raise ToolError(f"Failed to perform {button} click at ({x}, {y}): {str(e)}")

    async def handle_typing(self, text: str) -> ToolResult:
        """Type text via the computer interface, then screenshot.

        Args:
            text: The literal text to type.

        Raises:
            ToolError: If typing or the screenshot fails.
        """
        try:
            await self.computer.interface.type_text(text)

            # Brief pause so the UI reflects the typed text.
            await asyncio.sleep(0.3)

            screenshot = await self.computer.interface.screenshot()
            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")

            return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
        except Exception as e:
            self.logger.error(f"Error in handle_typing: {str(e)}")
            raise ToolError(f"Failed to type '{text}': {str(e)}")

    async def handle_key(self, key: Union[str, List[str]]) -> ToolResult:
        """Handle key press, supporting both single keys and combinations.

        Args:
            key: Either a string (e.g. "ctrl+c") or a list of keys
                (e.g. ["ctrl", "c"]).

        Raises:
            ToolError: If the key press or the screenshot fails.
        """
        try:
            # Normalize to a lowercase list of key names.
            if isinstance(key, list):
                keys = [k.strip().lower() for k in key]
            else:
                # Split key string into a list if it's a combination ("ctrl+c").
                keys = [k.strip().lower() for k in key.split("+")]

            # Translate OpenAI key names to interface names.
            mapped_keys = [KEY_MAPPING.get(k, k) for k in keys]

            if len(mapped_keys) > 1:
                # NOTE(review): combinations are emulated by pressing each key
                # in order and then again in reverse order. If press_key is a
                # tap (down+up) rather than a bare key-down, every key is
                # pressed twice — confirm press_key semantics against the
                # computer interface (a hotkey/combination API may be better).
                for k in mapped_keys:
                    await self.computer.interface.press_key(k)
                    await asyncio.sleep(0.1)
                for k in reversed(mapped_keys):
                    await self.computer.interface.press_key(k)
            else:
                # Single key press
                await self.computer.interface.press_key(mapped_keys[0])

            # Brief pause so the UI reflects the key press.
            await asyncio.sleep(0.3)

            screenshot = await self.computer.interface.screenshot()
            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")

            return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
        except Exception as e:
            self.logger.error(f"Error in handle_key: {str(e)}")
            raise ToolError(f"Failed to press key '{key}': {str(e)}")

    async def handle_mouse_move(self, x: int, y: int) -> ToolResult:
        """Move the cursor to (x, y), then screenshot.

        Raises:
            ToolError: If the move or the screenshot fails.
        """
        try:
            await self.computer.interface.move_cursor(x, y)

            # Brief pause before capturing the result.
            await asyncio.sleep(0.2)

            screenshot = await self.computer.interface.screenshot()
            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")

            return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
        except Exception as e:
            self.logger.error(f"Error in handle_mouse_move: {str(e)}")
            raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")

    async def handle_scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> ToolResult:
        """Scroll at (x, y), then screenshot.

        Args:
            x: Cursor x coordinate for the scroll.
            y: Cursor y coordinate for the scroll.
            scroll_x: Horizontal scroll delta (currently not acted upon).
            scroll_y: Vertical scroll delta; positive scrolls down.

        Raises:
            ToolError: If the scroll or the screenshot fails.
        """
        try:
            # Position the cursor where the scroll should happen.
            await self.computer.interface.move_cursor(x, y)

            # NOTE(review): only vertical scrolling is performed; scroll_x is
            # accepted and echoed in the output but never applied — confirm
            # whether the interface supports horizontal scrolling.
            if scroll_y > 0:
                await self.computer.interface.scroll_down(abs(scroll_y))
            elif scroll_y < 0:
                await self.computer.interface.scroll_up(abs(scroll_y))

            # Wait for UI to update
            await asyncio.sleep(0.5)

            screenshot = await self.computer.interface.screenshot()
            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")

            return ToolResult(
                output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
                base64_image=base64_screenshot,
            )
        except Exception as e:
            self.logger.error(f"Error in handle_scroll: {str(e)}")
            raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")

    async def screenshot(self) -> ToolResult:
        """Take a screenshot and return it as a base64-encoded ToolResult.

        Raises:
            ToolError: If the screenshot fails.
        """
        try:
            screenshot = await self.computer.interface.screenshot()
            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")

            return ToolResult(output="Screenshot taken", base64_image=base64_screenshot)
        except Exception as e:
            self.logger.error(f"Error in screenshot: {str(e)}")
            raise ToolError(f"Failed to take screenshot: {str(e)}")
@@ -0,0 +1,106 @@
1
+ """Tool manager for the OpenAI provider."""
2
+
3
+ import logging
4
+ from typing import Dict, Any, Optional, List, Callable, Awaitable, Union
5
+
6
+ from computer import Computer
7
+ from ..types import ComputerAction, ResponseItemType
8
+ from .computer import ComputerTool
9
+ from ....core.tools.base import ToolResult, ToolFailure
10
+ from ....core.tools.collection import ToolCollection
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class ToolManager:
    """Manager for computer tools in the OpenAI agent.

    Owns a single ComputerTool and exposes it through a ToolCollection
    after ``initialize()`` has been awaited.
    """

    def __init__(
        self,
        computer: Computer,
        acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
    ):
        """Initialize the tool manager.

        Args:
            computer: Computer instance.
            acknowledge_safety_check_callback: Optional callback for safety
                check acknowledgment.
        """
        self.computer = computer
        self.acknowledge_safety_check_callback = acknowledge_safety_check_callback
        self._initialized = False
        self.computer_tool = ComputerTool(computer)
        self.tools: Optional[ToolCollection] = None
        logger.info("Initialized OpenAI ToolManager")

    async def initialize(self) -> None:
        """Initialize the tool manager (idempotent)."""
        if self._initialized:
            return

        logger.info("Initializing OpenAI ToolManager")
        # Resolve screen dimensions, then expose the tool collection.
        await self.computer_tool.initialize_dimensions()
        self.tools = ToolCollection(self.computer_tool)
        self._initialized = True
        logger.info("OpenAI ToolManager initialized")

    async def get_tools_definition(self) -> List[Dict[str, Any]]:
        """Get the tools definition for the OpenAI agent.

        Returns:
            Tools definition for the OpenAI agent.

        Raises:
            RuntimeError: If ``initialize()`` has not been called.
        """
        if not self.tools:
            raise RuntimeError("Tools not initialized. Call initialize() first.")

        width, height = await self._get_computer_dimensions()

        # Default to "mac" when the computer does not report an environment.
        environment = getattr(self.computer, "environment", "mac")
        if environment not in ("windows", "mac", "linux", "browser"):
            logger.warning(f"Invalid environment value: {environment}, using 'mac' instead")
            environment = "mac"

        # NOTE(review): the tool type here is "computer-preview" while
        # ComputerTool.api_type is "computer_use_preview" — confirm which
        # string the target OpenAI API version expects.
        return [
            {
                "type": "computer-preview",
                "display_width": width,
                "display_height": height,
                "environment": environment,
            }
        ]

    async def _get_computer_dimensions(self) -> tuple[int, int]:
        """Get the dimensions of the computer display.

        Returns:
            Tuple of (width, height).
        """
        # Prefer the dimensions the computer tool already resolved.
        known_width = self.computer_tool.width
        known_height = self.computer_tool.height
        if known_width is not None and known_height is not None:
            return (known_width, known_height)

        # Otherwise ask the computer interface directly.
        size = await self.computer.interface.get_screen_size()
        return (int(size["width"]), int(size["height"]))

    async def execute_tool(self, name: str, tool_input: Dict[str, Any]) -> ToolResult:
        """Execute a tool with the given input.

        Args:
            name: Name of the tool to execute.
            tool_input: Input parameters for the tool.

        Returns:
            Result of the tool execution.

        Raises:
            RuntimeError: If ``initialize()`` has not been called.
        """
        if not self.tools:
            raise RuntimeError("Tools not initialized. Call initialize() first.")
        return await self.tools.run(name=name, tool_input=tool_input)
@@ -0,0 +1,36 @@
1
+ """Type definitions for the OpenAI provider."""
2
+
3
+ from enum import StrEnum, auto
4
+ from typing import Dict, List, Optional, Union, Any
5
+ from dataclasses import dataclass
6
+
7
+
8
+ class LLMProvider(StrEnum):
9
+ """OpenAI LLM provider types."""
10
+
11
+ OPENAI = "openai"
12
+
13
+
14
+ class ResponseItemType(StrEnum):
15
+ """Types of items in OpenAI Agent Response output."""
16
+
17
+ MESSAGE = "message"
18
+ COMPUTER_CALL = "computer_call"
19
+ COMPUTER_CALL_OUTPUT = "computer_call_output"
20
+ REASONING = "reasoning"
21
+
22
+
23
@dataclass
class ComputerAction:
    """Represents a computer action to be performed.

    Only the fields relevant to a given ``type`` are expected to be set;
    all others stay None.
    """

    type: str  # action name, e.g. "click", "type", "keypress", "scroll"
    x: Optional[int] = None  # pointer x coordinate
    y: Optional[int] = None  # pointer y coordinate
    text: Optional[str] = None  # text to type, or key string for keypress
    button: Optional[str] = None  # mouse button ("left", "right", "double")
    keys: Optional[List[str]] = None  # individual keys for a key combination
    ms: Optional[int] = None  # presumably a duration in milliseconds — confirm against callers
    scroll_x: Optional[int] = None  # horizontal scroll delta
    scroll_y: Optional[int] = None  # vertical scroll delta
    path: Optional[List[Dict[str, int]]] = None  # presumably drag-path points — confirm against callers
@@ -0,0 +1,98 @@
1
+ """Utility functions for the OpenAI provider."""
2
+
3
+ import logging
4
+ import json
5
+ import base64
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from ...core.types import AgentResponse
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def format_images_for_openai(images_base64: List[str]) -> List[Dict[str, Any]]:
    """Wrap base64-encoded images as input_image items for the Agent Response API.

    Args:
        images_base64: Base64-encoded image payloads (no data-URI prefix).

    Returns:
        One ``input_image`` dict per image, each carrying a PNG data URL.
    """
    prefix = "data:image/png;base64,"
    return [
        {"type": "input_image", "image_url": prefix + encoded}
        for encoded in images_base64
    ]
26
+
27
+
28
def extract_message_content(message: Dict[str, Any]) -> str:
    """Extract the plain-text content from a message dict.

    Handles three content shapes: a bare string (returned as-is), a list
    of typed parts ("input_text" for user messages, "text", or
    "output_text" — concatenated in order), and anything else (yields "").

    Args:
        message: Message to extract content from.

    Returns:
        Text content from the message.
    """
    content = message.get("content")
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    role = message.get("role", "user")
    pieces = []
    for part in content:
        if not isinstance(part, dict):
            continue
        part_type = part.get("type")
        # "input_text" counts only for user messages; "text" and
        # "output_text" count for any role.
        if part_type in ("text", "output_text"):
            pieces.append(part.get("text", ""))
        elif part_type == "input_text" and role == "user":
            pieces.append(part.get("text", ""))
    return "".join(pieces)
58
+
59
+
60
def sanitize_message(msg: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *msg* safe for logging, with image payloads redacted.

    Replaces the payload of ``image_url``/``input_image``/``image`` content
    parts and of ``computer_call_output`` image outputs with "[omitted]".
    Non-dict inputs pass through unchanged; *msg* itself is never mutated.

    Args:
        msg: Message to sanitize.

    Returns:
        Sanitized message.
    """
    if not isinstance(msg, dict):
        return msg

    def _redact(part: Any) -> Any:
        # Strip the large payload from any recognized image part.
        if not isinstance(part, dict):
            return part
        kind = part.get("type")
        if kind == "image_url" and "image_url" in part:
            return {"type": "image_url", "image_url": "[omitted]"}
        if kind == "input_image" and "image_url" in part:
            return {"type": "input_image", "image_url": "[omitted]"}
        if kind == "image" and "source" in part:
            return {"type": "image", "source": "[omitted]"}
        return part

    clean = dict(msg)

    content = clean.get("content")
    if isinstance(content, list):
        clean["content"] = [_redact(part) for part in content]

    # computer_call_output carries its screenshot under "output".
    if clean.get("type") == "computer_call_output" and "output" in clean:
        output = clean["output"]
        if isinstance(output, dict) and "image_url" in output:
            clean["output"] = {**output, "image_url": "[omitted]"}

    return clean