cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of cua-agent has been flagged as potentially problematic.
Files changed (111)
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py ADDED
@@ -0,0 +1,339 @@
+"""
+OmniParser (Set-of-Marks) agent loop implementation using liteLLM
+"""
+
+import asyncio
+import json
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+import litellm
+import inspect
+import base64
+
+from ..decorators import agent_loop
+from ..types import Messages, AgentResponse, Tools
+
+SOM_TOOL_SCHEMA = {
+    "type": "function",
+    "name": "computer",
+    "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "action": {
+                "type": "string",
+                "enum": [
+                    "screenshot",
+                    "click",
+                    "double_click",
+                    "drag",
+                    "type",
+                    "keypress",
+                    "scroll",
+                    "move",
+                    "wait",
+                    "get_current_url",
+                    "get_dimensions",
+                    "get_environment"
+                ],
+                "description": "The action to perform"
+            },
+            "element_id": {
+                "type": "integer",
+                "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
+            },
+            "start_element_id": {
+                "type": "integer",
+                "description": "The ID of the element to start dragging from (required for drag action)"
+            },
+            "end_element_id": {
+                "type": "integer",
+                "description": "The ID of the element to drag to (required for drag action)"
+            },
+            "text": {
+                "type": "string",
+                "description": "The text to type (required for type action)"
+            },
+            "keys": {
+                "type": "string",
+                "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
+            },
+            "button": {
+                "type": "string",
+                "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+            },
+            "scroll_x": {
+                "type": "integer",
+                "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+            },
+            "scroll_y": {
+                "type": "integer",
+                "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+            },
+        },
+        "required": [
+            "action"
+        ]
+    }
+}
+
+OMNIPARSER_AVAILABLE = False
+try:
+    from som import OmniParser
+    OMNIPARSER_AVAILABLE = True
+except ImportError:
+    pass
+OMNIPARSER_SINGLETON = None
+
+def get_parser() -> "OmniParser":
+    global OMNIPARSER_SINGLETON
+    if OMNIPARSER_SINGLETON is None:
+        OMNIPARSER_SINGLETON = OmniParser()
+    return OMNIPARSER_SINGLETON
+
+def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    """Get the last computer_call_output message from a messages list.
+
+    Args:
+        messages: List of messages to search through
+
+    Returns:
+        The last computer_call_output message dict, or None if not found
+    """
+    for message in reversed(messages):
+        if isinstance(message, dict) and message.get("type") == "computer_call_output":
+            return message
+    return None
+
+def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
+    """Prepare tools for OpenAI API format"""
+    omniparser_tools = []
+    id2xy = dict()
+
+    for schema in tool_schemas:
+        if schema["type"] == "computer":
+            omniparser_tools.append(SOM_TOOL_SCHEMA)
+            if "id2xy" in schema:
+                id2xy = schema["id2xy"]
+            else:
+                schema["id2xy"] = id2xy
+        elif schema["type"] == "function":
+            # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
+            # Schema should be: {type, name, description, parameters}
+            omniparser_tools.append({ "type": "function", **schema["function"] })
+
+    return omniparser_tools, id2xy
+
+async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
+    item_type = item.get("type")
+
+    def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
+        if element_id is None:
+            return (None, None)
+        return id2xy.get(element_id, (None, None))
+
+    if item_type == "function_call":
+        fn_name = item.get("name")
+        fn_args = json.loads(item.get("arguments", "{}"))
+
+        item_id = item.get("id")
+        call_id = item.get("call_id")
+
+        if fn_name == "computer":
+            action = fn_args.get("action")
+            element_id = fn_args.get("element_id")
+            start_element_id = fn_args.get("start_element_id")
+            end_element_id = fn_args.get("end_element_id")
+            text = fn_args.get("text")
+            keys = fn_args.get("keys")
+            button = fn_args.get("button")
+            scroll_x = fn_args.get("scroll_x")
+            scroll_y = fn_args.get("scroll_y")
+
+            x, y = _get_xy(element_id)
+            start_x, start_y = _get_xy(start_element_id)
+            end_x, end_y = _get_xy(end_element_id)
+
+            action_args = {
+                "type": action,
+                "x": x,
+                "y": y,
+                "start_x": start_x,
+                "start_y": start_y,
+                "end_x": end_x,
+                "end_y": end_y,
+                "text": text,
+                "keys": keys,
+                "button": button,
+                "scroll_x": scroll_x,
+                "scroll_y": scroll_y
+            }
+            # Remove None values to keep the JSON clean
+            action_args = {k: v for k, v in action_args.items() if v is not None}
+
+            return [{
+                "type": "computer_call",
+                "action": action_args,
+                "id": item_id,
+                "call_id": call_id,
+                "status": "completed"
+            }]
+
+    return [item]
+
+async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
+    """
+    Convert computer_call back to function_call format.
+    Also handles computer_call_output -> function_call_output conversion.
+
+    Args:
+        item: The item to convert
+        xy2id: Mapping from (x, y) coordinates to element IDs
+    """
+    item_type = item.get("type")
+
+    def _get_element_id(x: Optional[float], y: Optional[float]) -> Optional[int]:
+        """Get element ID from coordinates, return None if coordinates are None"""
+        if x is None or y is None:
+            return None
+        return xy2id.get((x, y))
+
+    if item_type == "computer_call":
+        action_data = item.get("action", {})
+
+        # Extract coordinates and convert back to element IDs
+        element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
+        start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
+        end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
+
+        # Build function arguments
+        fn_args = {
+            "action": action_data.get("type"),
+            "element_id": element_id,
+            "start_element_id": start_element_id,
+            "end_element_id": end_element_id,
+            "text": action_data.get("text"),
+            "keys": action_data.get("keys"),
+            "button": action_data.get("button"),
+            "scroll_x": action_data.get("scroll_x"),
+            "scroll_y": action_data.get("scroll_y")
+        }
+
+        # Remove None values to keep the JSON clean
+        fn_args = {k: v for k, v in fn_args.items() if v is not None}
+
+        return [{
+            "type": "function_call",
+            "name": "computer",
+            "arguments": json.dumps(fn_args),
+            "id": item.get("id"),
+            "call_id": item.get("call_id"),
+            "status": "completed",
+
+            # Fall back to string representation
+            "content": f"Used tool: {action_data.get('type')}({json.dumps(fn_args)})"
+        }]
+
+    elif item_type == "computer_call_output":
+        # Simple conversion: computer_call_output -> function_call_output
+        return [{
+            "type": "function_call_output",
+            "call_id": item.get("call_id"),
+            "content": [item.get("output")],
+            "id": item.get("id"),
+            "status": "completed"
+        }]
+
+    return [item]
+
+
+@agent_loop(models=r"omniparser\+.*|omni\+.*", priority=10)
+async def omniparser_loop(
+    messages: Messages,
+    model: str,
+    tools: Optional[List[Dict[str, Any]]] = None,
+    max_retries: Optional[int] = None,
+    stream: bool = False,
+    computer_handler=None,
+    use_prompt_caching: Optional[bool] = False,
+    _on_api_start=None,
+    _on_api_end=None,
+    _on_usage=None,
+    _on_screenshot=None,
+    **kwargs
+) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+    """
+    OmniParser (Set-of-Marks) agent loop using liteLLM responses.
+
+    Annotates screenshots with numbered UI elements so that any liteLLM-compatible
+    model can act on them by element ID.
+    """
+    if not OMNIPARSER_AVAILABLE:
+        raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
+
+    tools = tools or []
+
+    llm_model = model.split('+')[-1]
+
+    # Prepare tools for OpenAI API
+    openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
+
+    # Find last computer_call_output
+    last_computer_call_output = get_last_computer_call_output(messages)
+    if last_computer_call_output:
+        image_url = last_computer_call_output.get("output", {}).get("image_url", "")
+        image_data = image_url.split(",")[-1]
+        if image_data:
+            parser = get_parser()
+            result = parser.parse(image_data)
+            if _on_screenshot:
+                await _on_screenshot(result.annotated_image_base64, "annotated_image")
+            for element in result.elements:
+                id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
+
+    # handle computer calls -> function calls
+    new_messages = []
+    for message in messages:
+        if not isinstance(message, dict):
+            message = message.__dict__
+        new_messages += await replace_computer_call_with_function(message, id2xy)
+    messages = new_messages
+
+    # Prepare API call kwargs
+    api_kwargs = {
+        "model": llm_model,
+        "input": messages,
+        "tools": openai_tools if openai_tools else None,
+        "stream": stream,
+        "reasoning": {"summary": "concise"},
+        "truncation": "auto",
+        "num_retries": max_retries,
+        **kwargs
+    }
+
+    # Call API start hook
+    if _on_api_start:
+        await _on_api_start(api_kwargs)
+
+    print(str(api_kwargs)[:1000])
+
+    # Use liteLLM responses
+    response = await litellm.aresponses(**api_kwargs)
+
+    # Call API end hook
+    if _on_api_end:
+        await _on_api_end(api_kwargs, response)
+
+    # Extract usage information
+    response.usage = {
+        **response.usage.model_dump(),
+        "response_cost": response._hidden_params.get("response_cost", 0.0),
+    }
+    if _on_usage:
+        await _on_usage(response.usage)
+
+    # handle som function calls -> xy computer calls
+    new_output = []
+    for i in range(len(response.output)):
+        new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)
+    response.output = new_output
+
+    return response
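
The pair of converters above is the heart of the set-of-marks loop: model output that references numbered elements is translated into pixel-coordinate computer calls, and earlier computer calls are translated back into function calls before being replayed to the model. A minimal usage sketch of that round trip, assuming the new module is importable as `agent.loops.omniparser` from the installed wheel; the element ID and coordinates are invented for illustration:

```python
import asyncio
import json

# Import path assumed from the wheel layout shown above (agent/loops/omniparser.py).
from agent.loops.omniparser import (
    replace_computer_call_with_function,
    replace_function_with_computer_call,
)

async def main() -> None:
    # Hypothetical parse result: element 7 is centered at (412.0, 305.5).
    id2xy = {7: (412.0, 305.5)}
    xy2id = {xy: element_id for element_id, xy in id2xy.items()}

    # A model response that clicks element 7 by its set-of-marks ID.
    function_call = {
        "type": "function_call",
        "name": "computer",
        "arguments": json.dumps({"action": "click", "element_id": 7}),
        "id": "msg_1",
        "call_id": "call_1",
    }

    # SoM function call -> pixel-coordinate computer call.
    [computer_call] = await replace_function_with_computer_call(function_call, id2xy)
    print(computer_call["action"])   # {'type': 'click', 'x': 412.0, 'y': 305.5}

    # Computer call -> function call, for feeding history back to the model.
    [restored] = await replace_computer_call_with_function(computer_call, xy2id)
    print(restored["arguments"])     # {"action": "click", "element_id": 7}

asyncio.run(main())
```

Note that `replace_computer_call_with_function` looks elements up by their (x, y) tuple, which is why the sketch inverts `id2xy` into `xy2id` before the reverse pass.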
agent/loops/openai.py ADDED
@@ -0,0 +1,95 @@
+"""
+OpenAI computer-use-preview agent loop implementation using liteLLM
+"""
+
+import asyncio
+import json
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+import litellm
+
+from ..decorators import agent_loop
+from ..types import Messages, AgentResponse, Tools
+
+def _map_computer_tool_to_openai(computer_tool: Any) -> Dict[str, Any]:
+    """Map a computer tool to OpenAI's computer-use-preview tool schema"""
+    return {
+        "type": "computer_use_preview",
+        "display_width": getattr(computer_tool, 'display_width', 1024),
+        "display_height": getattr(computer_tool, 'display_height', 768),
+        "environment": getattr(computer_tool, 'environment', "linux")  # mac, windows, linux, browser
+    }
+
+
+def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
+    """Prepare tools for OpenAI API format"""
+    openai_tools = []
+
+    for schema in tool_schemas:
+        if schema["type"] == "computer":
+            # Map computer tool to OpenAI format
+            openai_tools.append(_map_computer_tool_to_openai(schema["computer"]))
+        elif schema["type"] == "function":
+            # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
+            # Schema should be: {type, name, description, parameters}
+            openai_tools.append({ "type": "function", **schema["function"] })
+
+    return openai_tools
+
+
+@agent_loop(models=r".*computer-use-preview.*", priority=10)
+async def openai_computer_use_loop(
+    messages: Messages,
+    model: str,
+    tools: Optional[List[Dict[str, Any]]] = None,
+    max_retries: Optional[int] = None,
+    stream: bool = False,
+    computer_handler=None,
+    use_prompt_caching: Optional[bool] = False,
+    _on_api_start=None,
+    _on_api_end=None,
+    _on_usage=None,
+    _on_screenshot=None,
+    **kwargs
+) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+    """
+    OpenAI computer-use-preview agent loop using liteLLM responses.
+
+    Supports OpenAI's computer use preview models.
+    """
+    tools = tools or []
+
+    # Prepare tools for OpenAI API
+    openai_tools = _prepare_tools_for_openai(tools)
+
+    # Prepare API call kwargs
+    api_kwargs = {
+        "model": model,
+        "input": messages,
+        "tools": openai_tools if openai_tools else None,
+        "stream": stream,
+        "reasoning": {"summary": "concise"},
+        "truncation": "auto",
+        "num_retries": max_retries,
+        **kwargs
+    }
+
+    # Call API start hook
+    if _on_api_start:
+        await _on_api_start(api_kwargs)
+
+    # Use liteLLM responses
+    response = await litellm.aresponses(**api_kwargs)
+
+    # Call API end hook
+    if _on_api_end:
+        await _on_api_end(api_kwargs, response)
+
+    # Extract usage information
+    response.usage = {
+        **response.usage.model_dump(),
+        "response_cost": response._hidden_params.get("response_cost", 0.0),
+    }
+    if _on_usage:
+        await _on_usage(response.usage)
+
+    return response
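
In the computer-use-preview loop the only real translation happens in `_prepare_tools_for_openai`: a computer tool is mapped to OpenAI's `computer_use_preview` tool description, and function tools pass through unchanged. A short sketch under the same import-path assumption (`agent.loops.openai`); `SimpleComputer` and the `open_url` schema are hypothetical stand-ins:

```python
# Import path assumed from the wheel layout shown above (agent/loops/openai.py).
from agent.loops.openai import _prepare_tools_for_openai

class SimpleComputer:
    # Stand-in for a computer handler; only the attributes read by
    # _map_computer_tool_to_openai are provided here.
    display_width = 1280
    display_height = 800
    environment = "mac"  # mac, windows, linux, or browser

tool_schemas = [
    {"type": "computer", "computer": SimpleComputer()},
    {
        "type": "function",
        "function": {
            "name": "open_url",  # hypothetical function tool
            "description": "Open a URL in the default browser",
            "parameters": {
                "type": "object",
                "properties": {"url": {"type": "string"}},
                "required": ["url"],
            },
        },
    },
]

print(_prepare_tools_for_openai(tool_schemas))
# [{'type': 'computer_use_preview', 'display_width': 1280, 'display_height': 800, 'environment': 'mac'},
#  {'type': 'function', 'name': 'open_url', 'description': 'Open a URL in the default browser', 'parameters': {...}}]
```

The resulting list is what the loop places in api_kwargs["tools"] before calling litellm.aresponses.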