PyPI - cua-agent - Versions diffs - 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

cua-agent 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (112)

agent/__init__.py +21 -12
agent/__main__.py +21 -0
agent/adapters/__init__.py +9 -0
agent/adapters/huggingfacelocal_adapter.py +229 -0
agent/agent.py +594 -0
agent/callbacks/__init__.py +19 -0
agent/callbacks/base.py +153 -0
agent/callbacks/budget_manager.py +44 -0
agent/callbacks/image_retention.py +139 -0
agent/callbacks/logging.py +247 -0
agent/callbacks/pii_anonymization.py +259 -0
agent/callbacks/telemetry.py +210 -0
agent/callbacks/trajectory_saver.py +305 -0
agent/cli.py +297 -0
agent/computer_handler.py +107 -0
agent/decorators.py +90 -0
agent/loops/__init__.py +11 -0
agent/loops/anthropic.py +728 -0
agent/loops/omniparser.py +339 -0
agent/loops/openai.py +95 -0
agent/loops/uitars.py +688 -0
agent/responses.py +207 -0
agent/telemetry.py +135 -14
agent/types.py +79 -0
agent/ui/__init__.py +7 -1
agent/ui/__main__.py +2 -13
agent/ui/gradio/__init__.py +6 -19
agent/ui/gradio/app.py +94 -1313
agent/ui/gradio/ui_components.py +721 -0
cua_agent-0.4.0.dist-info/METADATA +424 -0
cua_agent-0.4.0.dist-info/RECORD +33 -0
agent/core/__init__.py +0 -27
agent/core/agent.py +0 -210
agent/core/base.py +0 -217
agent/core/callbacks.py +0 -200
agent/core/experiment.py +0 -249
agent/core/factory.py +0 -122
agent/core/messages.py +0 -332
agent/core/provider_config.py +0 -21
agent/core/telemetry.py +0 -142
agent/core/tools/__init__.py +0 -21
agent/core/tools/base.py +0 -74
agent/core/tools/bash.py +0 -52
agent/core/tools/collection.py +0 -46
agent/core/tools/computer.py +0 -113
agent/core/tools/edit.py +0 -67
agent/core/tools/manager.py +0 -56
agent/core/tools.py +0 -32
agent/core/types.py +0 -88
agent/core/visualization.py +0 -197
agent/providers/__init__.py +0 -4
agent/providers/anthropic/__init__.py +0 -6
agent/providers/anthropic/api/client.py +0 -360
agent/providers/anthropic/api/logging.py +0 -150
agent/providers/anthropic/api_handler.py +0 -140
agent/providers/anthropic/callbacks/__init__.py +0 -5
agent/providers/anthropic/callbacks/manager.py +0 -65
agent/providers/anthropic/loop.py +0 -568
agent/providers/anthropic/prompts.py +0 -23
agent/providers/anthropic/response_handler.py +0 -226
agent/providers/anthropic/tools/__init__.py +0 -33
agent/providers/anthropic/tools/base.py +0 -88
agent/providers/anthropic/tools/bash.py +0 -66
agent/providers/anthropic/tools/collection.py +0 -34
agent/providers/anthropic/tools/computer.py +0 -396
agent/providers/anthropic/tools/edit.py +0 -326
agent/providers/anthropic/tools/manager.py +0 -54
agent/providers/anthropic/tools/run.py +0 -42
agent/providers/anthropic/types.py +0 -16
agent/providers/anthropic/utils.py +0 -381
agent/providers/omni/__init__.py +0 -8
agent/providers/omni/api_handler.py +0 -42
agent/providers/omni/clients/anthropic.py +0 -103
agent/providers/omni/clients/base.py +0 -35
agent/providers/omni/clients/oaicompat.py +0 -195
agent/providers/omni/clients/ollama.py +0 -122
agent/providers/omni/clients/openai.py +0 -155
agent/providers/omni/clients/utils.py +0 -25
agent/providers/omni/image_utils.py +0 -34
agent/providers/omni/loop.py +0 -990
agent/providers/omni/parser.py +0 -307
agent/providers/omni/prompts.py +0 -64
agent/providers/omni/tools/__init__.py +0 -30
agent/providers/omni/tools/base.py +0 -29
agent/providers/omni/tools/bash.py +0 -74
agent/providers/omni/tools/computer.py +0 -179
agent/providers/omni/tools/manager.py +0 -61
agent/providers/omni/utils.py +0 -236
agent/providers/openai/__init__.py +0 -6
agent/providers/openai/api_handler.py +0 -456
agent/providers/openai/loop.py +0 -472
agent/providers/openai/response_handler.py +0 -205
agent/providers/openai/tools/__init__.py +0 -15
agent/providers/openai/tools/base.py +0 -79
agent/providers/openai/tools/computer.py +0 -326
agent/providers/openai/tools/manager.py +0 -106
agent/providers/openai/types.py +0 -36
agent/providers/openai/utils.py +0 -98
agent/providers/uitars/__init__.py +0 -1
agent/providers/uitars/clients/base.py +0 -35
agent/providers/uitars/clients/mlxvlm.py +0 -263
agent/providers/uitars/clients/oaicompat.py +0 -214
agent/providers/uitars/loop.py +0 -660
agent/providers/uitars/prompts.py +0 -63
agent/providers/uitars/tools/__init__.py +0 -1
agent/providers/uitars/tools/computer.py +0 -283
agent/providers/uitars/tools/manager.py +0 -60
agent/providers/uitars/utils.py +0 -264
cua_agent-0.3.2.dist-info/METADATA +0 -295
cua_agent-0.3.2.dist-info/RECORD +0 -87
{cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/WHEEL +0 -0
{cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/entry_points.txt +0 -0

agent/responses.py ADDED Viewed

@@ -0,0 +1,207 @@
+"""
+Functions for making various Responses API items from different types of responses.
+Based on the OpenAI spec for Responses API items.
+"""
+import base64
+import json
+import uuid
+from typing import List, Dict, Any, Literal, Union, Optional
+from openai.types.responses.response_computer_tool_call_param import (
+    ResponseComputerToolCallParam,
+    ActionClick,
+    ActionDoubleClick,
+    ActionDrag,
+    ActionDragPath,
+    ActionKeypress,
+    ActionMove,
+    ActionScreenshot,
+    ActionScroll,
+    ActionType as ActionTypeAction,
+    ActionWait,
+    PendingSafetyCheck
+)
+from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
+from openai.types.responses.response_output_text_param import ResponseOutputTextParam
+from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
+from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
+from openai.types.responses.easy_input_message_param import EasyInputMessageParam
+from openai.types.responses.response_input_image_param import ResponseInputImageParam
+def random_id():
+    return str(uuid.uuid4())
+# User message items
+def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
+    return EasyInputMessageParam(
+        content=[
+            ResponseInputImageParam(
+                type="input_image",
+                image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
+            )
+        ],
+        role="user",
+        type="message"
+    )
+# Text items
+def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
+    return ResponseReasoningItemParam(
+        id=random_id(),
+        summary=[
+            Summary(text=reasoning, type="summary_text")
+        ],
+        type="reasoning"
+    )
+def make_output_text_item(content: str) -> ResponseOutputMessageParam:
+    return ResponseOutputMessageParam(
+        id=random_id(),
+        content=[
+            ResponseOutputTextParam(
+                text=content,
+                type="output_text",
+                annotations=[]
+            )
+        ],
+        role="assistant",
+        status="completed",
+        type="message"
+    )
+# Function call items
+def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
+    return ResponseFunctionToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        name=function_name,
+        arguments=json.dumps(arguments),
+        status="completed",
+        type="function_call"
+    )
+# Computer tool call items
+def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionClick(
+            button=button,
+            type="click",
+            x=x,
+            y=y
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionDoubleClick(
+            type="double_click",
+            x=x,
+            y=y
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionDrag(
+            path=drag_path,
+            type="drag"
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionKeypress(
+            keys=keys,
+            type="keypress"
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionMove(
+            type="move",
+            x=x,
+            y=y
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionScreenshot(
+            type="screenshot"
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionScroll(
+            scroll_x=scroll_x,
+            scroll_y=scroll_y,
+            type="scroll",
+            x=x,
+            y=y
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionTypeAction(
+            text=text,
+            type="type"
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )
+def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+    return ResponseComputerToolCallParam(
+        id=random_id(),
+        call_id=call_id if call_id else random_id(),
+        action=ActionWait(
+            type="wait"
+        ),
+        pending_safety_checks=[],
+        status="completed",
+        type="computer_call"
+    )

agent/telemetry.py CHANGED Viewed

@@ -1,21 +1,142 @@
-"""Telemetry support for Agent class."""
+"""Agent telemetry for tracking anonymous usage and feature usage."""
+import logging
 import os
 import platform
 import sys
-import time
-from typing import Any, Dict, Optional
-from core.telemetry import (
-    record_event,
-    is_telemetry_enabled,
-    flush,
-    get_telemetry_client,
-    increment,
-)
-# System information used for telemetry
+from typing import Dict, Any, Callable
+# Import the core telemetry module
+TELEMETRY_AVAILABLE = False
+# Local fallbacks in case core telemetry isn't available
+def _noop(*args: Any, **kwargs: Any) -> None:
+    """No-op function for when telemetry is not available."""
+    pass
+# Define default functions with unique names to avoid shadowing
+_default_record_event = _noop
+_default_increment_counter = _noop
+_default_set_dimension = _noop
+_default_get_telemetry_client = lambda: None
+_default_flush = _noop
+_default_is_telemetry_enabled = lambda: False
+_default_is_telemetry_globally_disabled = lambda: True
+# Set the actual functions to the defaults initially
+record_event = _default_record_event
+increment_counter = _default_increment_counter
+set_dimension = _default_set_dimension
+get_telemetry_client = _default_get_telemetry_client
+flush = _default_flush
+is_telemetry_enabled = _default_is_telemetry_enabled
+is_telemetry_globally_disabled = _default_is_telemetry_globally_disabled
+logger = logging.getLogger("agent.telemetry")
+try:
+    # Import from core telemetry
+    from core.telemetry import (
+        record_event as core_record_event,
+        increment as core_increment,
+        get_telemetry_client as core_get_telemetry_client,
+        flush as core_flush,
+        is_telemetry_enabled as core_is_telemetry_enabled,
+        is_telemetry_globally_disabled as core_is_telemetry_globally_disabled,
+    )
+    # Override the default functions with actual implementations
+    record_event = core_record_event
+    get_telemetry_client = core_get_telemetry_client
+    flush = core_flush
+    is_telemetry_enabled = core_is_telemetry_enabled
+    is_telemetry_globally_disabled = core_is_telemetry_globally_disabled
+    def increment_counter(counter_name: str, value: int = 1) -> None:
+        """Wrapper for increment to maintain backward compatibility."""
+        if is_telemetry_enabled():
+            core_increment(counter_name, value)
+    def set_dimension(name: str, value: Any) -> None:
+        """Set a dimension that will be attached to all events."""
+        logger.debug(f"Setting dimension {name}={value}")
+    TELEMETRY_AVAILABLE = True
+    logger.info("Successfully imported telemetry")
+except ImportError as e:
+    logger.warning(f"Could not import telemetry: {e}")
+    logger.debug("Telemetry not available, using no-op functions")
+# Get system info once to use in telemetry
 SYSTEM_INFO = {
-    "os": sys.platform,
+    "os": platform.system().lower(),
+    "os_version": platform.release(),
     "python_version": platform.python_version(),
 }
+def enable_telemetry() -> bool:
+    """Enable telemetry if available.
+    Returns:
+        bool: True if telemetry was successfully enabled, False otherwise
+    """
+    global TELEMETRY_AVAILABLE, record_event, increment_counter, get_telemetry_client, flush, is_telemetry_enabled, is_telemetry_globally_disabled
+    # Check if globally disabled using core function
+    if TELEMETRY_AVAILABLE and is_telemetry_globally_disabled():
+        logger.info("Telemetry is globally disabled via environment variable - cannot enable")
+        return False
+    # Already enabled
+    if TELEMETRY_AVAILABLE:
+        return True
+    # Try to import and enable
+    try:
+        from core.telemetry import (
+            record_event,
+            increment,
+            get_telemetry_client,
+            flush,
+            is_telemetry_globally_disabled,
+        )
+        # Check again after import
+        if is_telemetry_globally_disabled():
+            logger.info("Telemetry is globally disabled via environment variable - cannot enable")
+            return False
+        TELEMETRY_AVAILABLE = True
+        logger.info("Telemetry successfully enabled")
+        return True
+    except ImportError as e:
+        logger.warning(f"Could not enable telemetry: {e}")
+        return False
+def is_telemetry_enabled() -> bool:
+    """Check if telemetry is enabled.
+    Returns:
+        bool: True if telemetry is enabled, False otherwise
+    """
+    # Use the core function if available, otherwise use our local flag
+    if TELEMETRY_AVAILABLE:
+        from core.telemetry import is_telemetry_enabled as core_is_enabled
+        return core_is_enabled()
+    return False
+def record_agent_initialization() -> None:
+    """Record when an agent instance is initialized."""
+    if TELEMETRY_AVAILABLE and is_telemetry_enabled():
+        record_event("agent_initialized", SYSTEM_INFO)
+        # Set dimensions that will be attached to all events
+        set_dimension("os", SYSTEM_INFO["os"])
+        set_dimension("os_version", SYSTEM_INFO["os_version"])
+        set_dimension("python_version", SYSTEM_INFO["python_version"])

agent/types.py ADDED Viewed

@@ -0,0 +1,79 @@
+"""
+Type definitions for agent
+"""
+from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
+from pydantic import BaseModel
+import re
+from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
+from collections.abc import Iterable
+# Agent input types
+Messages = str | ResponseInputParam
+Tools = Optional[Iterable[ToolParam]]
+# Agent output types
+AgentResponse = ResponsesAPIResponse
+# Agent loop registration
+class AgentLoopInfo(BaseModel):
+    """Information about a registered agent loop"""
+    func: Callable
+    models_regex: str
+    priority: int = 0
+    def matches_model(self, model: str) -> bool:
+        """Check if this loop matches the given model"""
+        return bool(re.match(self.models_regex, model))
+# Computer tool interface
+class Computer(Protocol):
+    """Protocol defining the interface for computer interactions."""
+    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
+        """Get the current environment type."""
+        ...
+    async def get_dimensions(self) -> tuple[int, int]:
+        """Get screen dimensions as (width, height)."""
+        ...
+    async def screenshot(self) -> str:
+        """Take a screenshot and return as base64 string."""
+        ...
+    async def click(self, x: int, y: int, button: str = "left") -> None:
+        """Click at coordinates with specified button."""
+        ...
+    async def double_click(self, x: int, y: int) -> None:
+        """Double click at coordinates."""
+        ...
+    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
+        """Scroll at coordinates with specified scroll amounts."""
+        ...
+    async def type(self, text: str) -> None:
+        """Type text."""
+        ...
+    async def wait(self, ms: int = 1000) -> None:
+        """Wait for specified milliseconds."""
+        ...
+    async def move(self, x: int, y: int) -> None:
+        """Move cursor to coordinates."""
+        ...
+    async def keypress(self, keys: List[str]) -> None:
+        """Press key combination."""
+        ...
+    async def drag(self, path: List[Dict[str, int]]) -> None:
+        """Drag along specified path."""
+        ...
+    async def get_current_url(self) -> str:
+        """Get current URL (for browser environments)."""
+        ...

agent/ui/__init__.py CHANGED Viewed

@@ -1 +1,7 @@
-"""UI modules for the Computer-Use Agent."""
+"""
+UI components for agent
+"""
+from .gradio import launch_ui, create_gradio_ui
+__all__ = ["launch_ui", "create_gradio_ui"]

agent/ui/__main__.py CHANGED Viewed

@@ -1,15 +1,4 @@
-"""
-Main entry point for agent.ui module.
-This allows running the agent UI with:
-    python -m agent.ui
-Instead of:
-    python -m agent.ui.gradio.app
-"""
-from .gradio.app import create_gradio_ui
+from .gradio import launch_ui
 if __name__ == "__main__":
-    app = create_gradio_ui()
-    app.launch(share=False, inbrowser=True)
+    launch_ui()

agent/ui/gradio/__init__.py CHANGED Viewed

@@ -1,21 +1,8 @@
-"""Gradio UI for Computer-Use Agent."""
+"""
+Gradio UI for agent
+"""
-import gradio as gr
-from typing import Optional
+from .app import launch_ui
+from .ui_components import create_gradio_ui
-from .app import create_gradio_ui
-def registry(name: str = "cua:gpt-4o") -> gr.Blocks:
-    """Create and register a Gradio UI for the Computer-Use Agent.
-    Args:
-        name: The name to use for the Gradio app, in format 'provider:model'
-    Returns:
-        A Gradio Blocks application
-    """
-    provider, model = name.split(":", 1) if ":" in name else ("openai", name)
-    # Create and return the Gradio UI
-    return create_gradio_ui(provider_name=provider, model_name=model)
+__all__ = ["launch_ui", "create_gradio_ui"]

cua-agent 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

Potentially problematic release.

cua-agent 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl