PyPI - cua-agent - Versions diffs - 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl - Mend

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (79) hide show

agent/__init__.py +4 -10
agent/__main__.py +2 -1
agent/adapters/__init__.py +4 -0
agent/adapters/azure_ml_adapter.py +283 -0
agent/adapters/cua_adapter.py +161 -0
agent/adapters/huggingfacelocal_adapter.py +67 -125
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +110 -99
agent/adapters/models/__init__.py +41 -0
agent/adapters/models/generic.py +78 -0
agent/adapters/models/internvl.py +290 -0
agent/adapters/models/opencua.py +115 -0
agent/adapters/models/qwen2_5_vl.py +78 -0
agent/agent.py +337 -185
agent/callbacks/__init__.py +9 -4
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +54 -98
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +35 -33
agent/callbacks/otel.py +291 -0
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/prompt_instructions.py +47 -0
agent/callbacks/telemetry.py +99 -61
agent/callbacks/trajectory_saver.py +95 -69
agent/cli.py +269 -119
agent/computers/__init__.py +14 -9
agent/computers/base.py +32 -19
agent/computers/cua.py +52 -25
agent/computers/custom.py +78 -71
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +359 -235
agent/integrations/hud/__init__.py +38 -99
agent/integrations/hud/agent.py +369 -0
agent/integrations/hud/proxy.py +166 -52
agent/loops/__init__.py +44 -14
agent/loops/anthropic.py +579 -492
agent/loops/base.py +19 -15
agent/loops/composed_grounded.py +136 -150
agent/loops/fara/__init__.py +8 -0
agent/loops/fara/config.py +506 -0
agent/loops/fara/helpers.py +357 -0
agent/loops/fara/schema.py +143 -0
agent/loops/gelato.py +183 -0
agent/loops/gemini.py +935 -0
agent/loops/generic_vlm.py +601 -0
agent/loops/glm45v.py +140 -135
agent/loops/gta1.py +48 -51
agent/loops/holo.py +218 -0
agent/loops/internvl.py +180 -0
agent/loops/moondream3.py +493 -0
agent/loops/omniparser.py +326 -226
agent/loops/openai.py +50 -51
agent/loops/opencua.py +134 -0
agent/loops/uiins.py +175 -0
agent/loops/uitars.py +247 -206
agent/loops/uitars2.py +951 -0
agent/playground/__init__.py +5 -0
agent/playground/server.py +301 -0
agent/proxy/examples.py +61 -57
agent/proxy/handlers.py +46 -39
agent/responses.py +447 -347
agent/tools/__init__.py +24 -0
agent/tools/base.py +253 -0
agent/tools/browser_tool.py +423 -0
agent/types.py +11 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +25 -22
agent/ui/gradio/ui_components.py +314 -167
cua_agent-0.7.16.dist-info/METADATA +85 -0
cua_agent-0.7.16.dist-info/RECORD +79 -0
{cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
cua_agent-0.4.22.dist-info/METADATA +0 -436
cua_agent-0.4.22.dist-info/RECORD +0 -51
{cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0

agent/tools/browser_tool.py ADDED Viewed

@@ -0,0 +1,423 @@
+"""
+Browser Tool for agent interactions.
+Allows agents to control a browser programmatically via Playwright.
+Implements the computer_use action interface for comprehensive browser control.
+"""
+import asyncio
+import logging
+from typing import TYPE_CHECKING, Optional, Union
+from .base import BaseComputerTool, register_tool
+if TYPE_CHECKING:
+    from computer.interface import GenericComputerInterface
+logger = logging.getLogger(__name__)
+@register_tool("computer_use")
+class BrowserTool(BaseComputerTool):
+    """
+    Browser tool that uses the computer SDK's interface to control a browser.
+    Implements a comprehensive computer_use action interface for browser control.
+    """
+    def __init__(self, interface: "GenericComputerInterface", cfg: Optional[dict] = None):
+        """
+        Initialize the BrowserTool.
+        Viewport and resized dimensions start out as None. When the current event
+        loop is not running they are resolved immediately from a screenshot;
+        otherwise they are resolved lazily on first use (see ``_proc_coords``).
+        Args:
+            interface: A GenericComputerInterface instance that provides playwright_exec
+            cfg: Optional configuration dictionary
+        """
+        self.interface = interface
+        self._facts = []  # Facts stored by the pause_and_memorize_fact action
+        # Screenshot size and its smart_resize counterpart; None until resolved
+        self.viewport_width = None
+        self.viewport_height = None
+        self.resized_width = None
+        self.resized_height = None
+        # Best effort only: a failure here must never prevent construction.
+        try:
+            loop = asyncio.get_event_loop()
+            if not loop.is_running():
+                loop.run_until_complete(self._initialize_dimensions())
+        except Exception as exc:
+            # For example "no current event loop"; record it instead of hiding it.
+            logger.debug("Deferring dimension initialization: %s", exc)
+        # Base initialization stays last, after every attribute above exists.
+        super().__init__(cfg)
+    async def _initialize_dimensions(self):
+        """
+        Populate viewport and resized dimensions from a fresh screenshot.
+        The viewport size is the decoded screenshot's size; the resized size is
+        whatever ``smart_resize`` returns for it. If anything fails (the imports
+        included), a warning is logged and all four values fall back to 1024x768.
+        """
+        try:
+            import base64
+            import io
+            from PIL import Image
+            from qwen_vl_utils import smart_resize
+            # Pixel budget handed to smart_resize
+            MIN_PIXELS = 3136
+            MAX_PIXELS = 12845056
+            decoded = base64.b64decode(await self.screenshot())
+            image = Image.open(io.BytesIO(decoded))
+            # Size of the captured screenshot
+            self.viewport_width, self.viewport_height = image.width, image.height
+            # smart_resize takes and returns (height, width)
+            resized_h, resized_w = smart_resize(
+                image.height,
+                image.width,
+                factor=28,
+                min_pixels=MIN_PIXELS,
+                max_pixels=MAX_PIXELS,
+            )
+            self.resized_width, self.resized_height = resized_w, resized_h
+        except Exception as e:
+            logger.warning(f"Failed to initialize dimensions: {e}")
+            self.viewport_width = self.resized_width = 1024
+            self.viewport_height = self.resized_height = 768
+    async def _proc_coords(self, x: float, y: float) -> tuple:
+        """
+        Map a point from resized-image space to viewport pixels.
+        Dimensions are initialized on demand when they are not known yet.
+        Args:
+            x: X coordinate in resized space (0 to resized_width)
+            y: Y coordinate in resized space (0 to resized_height)
+        Returns:
+            Tuple of (viewport_x, viewport_y), each rounded to an int
+        """
+        dims_missing = self.resized_width is None or self.resized_height is None
+        if dims_missing:
+            await self._initialize_dimensions()
+        # Fraction along each resized axis, scaled onto the matching viewport axis
+        x_fraction = x / self.resized_width
+        y_fraction = y / self.resized_height
+        scaled = (x_fraction * self.viewport_width, y_fraction * self.viewport_height)
+        return tuple(int(round(value)) for value in scaled)
+    @property
+    def description(self) -> str:
+        """
+        Tool description presented to the model, one guideline per line.
+        The advertised screen resolution is the resized size when it is known and
+        1024x768 otherwise.
+        """
+        width = self.resized_width if self.resized_width is not None else 1024
+        height = self.resized_height if self.resized_height is not None else 768
+        # Each guideline is a separate entry joined by a real newline. Writing them
+        # as one literal with backslash line continuations drops every separator
+        # and fuses the bullets ("...screenshots.* This is an interface...").
+        guidelines = (
+            "Use a mouse and keyboard to interact with a computer, and take screenshots.",
+            "* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.",
+            "* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.",
+            f"* The screen's resolution is {width}x{height}.",
+            "* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.",
+            "* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.",
+            "* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.",
+            "* When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().",
+            "* If a popup window appears that you want to close, if left_click() on the 'X' or close button doesn't work, try key(keys=['Escape']) to close it.",
+            "* On some search bars, when you type(), you may need to press_enter=False and instead separately call left_click() on the search button to submit the search query. This is especially true of search bars that have auto-suggest popups for e.g. locations",
+            "* For calendar widgets, you usually need to left_click() on arrows to move between months and left_click() on dates to select them; type() is not typically used to input dates there.",
+        )
+        return "\n".join(guidelines)
+    @property
+    def parameters(self) -> dict:
+        """
+        Schema describing the arguments accepted by ``call``.
+        Only ``action`` is listed as required; each remaining property is read by
+        the handler of the action named in its description.
+        """
+        return {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "description": """The action to perform. The available actions are:
+* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order. Includes 'Enter', 'Alt', 'Shift', 'Tab', 'Control', 'Backspace', 'Delete', 'Escape', 'ArrowUp', 'ArrowDown', 'ArrowLeft', 'ArrowRight', 'PageDown', 'PageUp', 'Shift', etc.
+* type: Type a string of text on the keyboard.
+* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.
+* left_click: Click the left mouse button.
+* scroll: Performs a scroll of the mouse scroll wheel.
+* visit_url: Visit a specified URL.
+* web_search: Perform a web search with a specified query.
+* history_back: Go back to the previous page in the browser history.
+* pause_and_memorize_fact: Pause and memorize a fact for future reference.
+* wait: Wait specified seconds for the change to happen.
+* terminate: Terminate the current task and report its completion status.""",
+                    # Must stay in sync with the branches of _execute_action
+                    "enum": [
+                        "key",
+                        "type",
+                        "mouse_move",
+                        "left_click",
+                        "scroll",
+                        "visit_url",
+                        "web_search",
+                        "history_back",
+                        "pause_and_memorize_fact",
+                        "wait",
+                        "terminate",
+                    ],
+                    "type": "string",
+                },
+                "keys": {"description": "Required only by action=key.", "type": "array"},
+                "text": {"description": "Required only by action=type.", "type": "string"},
+                # NOTE(review): _action_type treats coordinate as optional (it only
+                # clicks first when one is given), while the text below lists it as
+                # required for action=type; confirm which is intended.
+                "coordinate": {
+                    "description": "(x, y) coordinates for mouse actions. Required only by action=left_click, action=mouse_move, and action=type.",
+                    "type": "array",
+                },
+                # _action_scroll negates this value before passing it on as delta_y
+                "pixels": {
+                    "description": "Amount of scrolling. Positive = up, Negative = down. Required only by action=scroll.",
+                    "type": "number",
+                },
+                "url": {
+                    "description": "The URL to visit. Required only by action=visit_url.",
+                    "type": "string",
+                },
+                "query": {
+                    "description": "The query to search for. Required only by action=web_search.",
+                    "type": "string",
+                },
+                "fact": {
+                    "description": "The fact to remember for the future. Required only by action=pause_and_memorize_fact.",
+                    "type": "string",
+                },
+                "time": {
+                    "description": "Seconds to wait. Required only by action=wait.",
+                    "type": "number",
+                },
+                # _action_terminate falls back to "success" when status is omitted
+                "status": {
+                    "description": "Status of the task. Required only by action=terminate.",
+                    "type": "string",
+                    "enum": ["success", "failure"],
+                },
+            },
+            "required": ["action"],
+        }
+    def call(self, params: Union[str, dict], **kwargs) -> Union[str, dict]:
+        """
+        Execute a browser action synchronously.
+        The async handler is driven to completion from sync code: on a helper
+        thread when this thread's event loop is already running, otherwise on
+        this thread's own loop (created on demand when none is usable).
+        Args:
+            params: Action parameters (JSON string or dict)
+            **kwargs: Additional keyword arguments (unused)
+        Returns:
+            Result of the action execution; failures are reported as
+            ``{"success": False, "error": ...}`` instead of being raised
+        """
+        params_dict = self._verify_json_format_args(params)
+        action = params_dict.get("action")
+        if not action:
+            return {"success": False, "error": "action parameter is required"}
+        try:
+            try:
+                loop = asyncio.get_event_loop()
+            except RuntimeError:
+                # Worker threads (and newer Python versions) have no implicit loop.
+                loop = None
+            if loop is not None and loop.is_running():
+                # run_until_complete cannot be nested inside a running loop, so the
+                # coroutine runs on a fresh loop in a helper thread while this
+                # thread blocks on the result.
+                import concurrent.futures
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future = executor.submit(asyncio.run, self._execute_action(action, params_dict))
+                    return future.result()
+            if loop is None or loop.is_closed():
+                # No usable loop: create one and keep it for later calls from this thread.
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+            return loop.run_until_complete(self._execute_action(action, params_dict))
+        except Exception as e:
+            logger.error(f"Error executing action {action}: {e}")
+            return {"success": False, "error": str(e)}
+    async def _execute_action(self, action: str, params: dict) -> dict:
+        """
+        Route ``action`` to its ``_action_*`` coroutine and return that result.
+        Unknown actions, and any exception raised by a handler, are reported as
+        ``{"success": False, "error": ...}`` rather than propagated.
+        """
+        try:
+            # Checked in order with ==, exactly like an if/elif chain
+            routes = (
+                ("key", self._action_key),
+                ("type", self._action_type),
+                ("mouse_move", self._action_mouse_move),
+                ("left_click", self._action_left_click),
+                ("scroll", self._action_scroll),
+                ("visit_url", self._action_visit_url),
+                ("web_search", self._action_web_search),
+                ("history_back", self._action_history_back),
+                ("pause_and_memorize_fact", self._action_pause_and_memorize_fact),
+                ("wait", self._action_wait),
+                ("terminate", self._action_terminate),
+            )
+            for name, handler in routes:
+                if action == name:
+                    return await handler(params)
+            return {"success": False, "error": f"Unknown action: {action}"}
+        except Exception as e:
+            logger.error(f"Error in action {action}: {e}")
+            return {"success": False, "error": str(e)}
+    async def _action_key(self, params: dict) -> dict:
+        """
+        Forward the requested keys to the interface's ``hotkey`` call.
+        Args:
+            params: Action parameters; ``keys`` is a list of key names (a single
+                key name given as a plain string is accepted as well)
+        Returns:
+            A success dict naming the pressed keys, or an error dict when ``keys``
+            is missing/empty or the hotkey call raises
+        """
+        keys = params.get("keys", [])
+        if not keys:
+            return {"success": False, "error": "keys parameter is required"}
+        # A lone key name such as "Enter" would otherwise be unpacked character by
+        # character by the * below; treat it as a one-element list.
+        if isinstance(keys, str):
+            keys = [keys]
+        try:
+            await self.interface.interface.hotkey(*keys)
+            return {"success": True, "message": f"Pressed keys: {keys}"}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    async def _action_type(self, params: dict) -> dict:
+        """
+        Type text, optionally clicking a target point first.
+        A two-element ``coordinate`` triggers a click at that point before typing.
+        Returns the result of the "type" playwright_exec call, or an error dict
+        when ``text`` is missing or empty.
+        """
+        # NOTE(review): coordinates are forwarded as given, without _proc_coords;
+        # confirm the caller already converts them to viewport pixels.
+        text = params.get("text")
+        if not text:
+            return {"success": False, "error": "text parameter is required"}
+        target = params.get("coordinate")
+        wants_click = bool(target) and len(target) == 2
+        if wants_click:
+            click_args = {"x": target[0], "y": target[1]}
+            await self.interface.playwright_exec("click", click_args)
+        return await self.interface.playwright_exec("type", {"text": text})
+    async def _action_mouse_move(self, params: dict) -> dict:
+        """
+        Move the cursor to ``params["coordinate"]``.
+        Returns a success dict echoing the coordinate, or an error dict when the
+        coordinate is missing or does not have exactly two elements.
+        """
+        # NOTE(review): coordinates are forwarded as given, without _proc_coords;
+        # confirm the caller already converts them to viewport pixels.
+        target = params.get("coordinate")
+        if target and len(target) == 2:
+            x, y = target[0], target[1]
+            await self.interface.interface.move_cursor(x, y)
+            return {"success": True, "message": f"Moved cursor to {target}"}
+        return {"success": False, "error": "coordinate parameter [x, y] is required"}
+    async def _action_left_click(self, params: dict) -> dict:
+        """
+        Click at ``params["coordinate"]`` through playwright_exec.
+        Returns the playwright_exec result, or an error dict when the coordinate
+        is missing or does not have exactly two elements.
+        """
+        # NOTE(review): coordinates are forwarded as given, without _proc_coords;
+        # confirm the caller already converts them to viewport pixels.
+        target = params.get("coordinate")
+        if target and len(target) == 2:
+            click_args = {"x": target[0], "y": target[1]}
+            return await self.interface.playwright_exec("click", click_args)
+        return {"success": False, "error": "coordinate parameter [x, y] is required"}
+    async def _action_scroll(self, params: dict) -> dict:
+        """
+        Scroll the page vertically by ``params["pixels"]``.
+        Positive pixels mean "up", so the value is negated to obtain delta_y.
+        A missing or zero amount yields an error dict.
+        """
+        amount = params.get("pixels", 0)
+        if amount == 0:
+            return {"success": False, "error": "pixels parameter is required"}
+        scroll_args = {"delta_x": 0, "delta_y": -amount}
+        return await self.interface.playwright_exec("scroll", scroll_args)
+    async def _action_visit_url(self, params: dict) -> dict:
+        """
+        Navigate to ``params["url"]`` through playwright_exec.
+        Returns the playwright_exec result, or an error dict when no URL is given.
+        """
+        target_url = params.get("url")
+        if target_url:
+            return await self.interface.playwright_exec("visit_url", {"url": target_url})
+        return {"success": False, "error": "url parameter is required"}
+    async def _action_web_search(self, params: dict) -> dict:
+        """
+        Run a web search for ``params["query"]`` through playwright_exec.
+        Returns the playwright_exec result, or an error dict when no query is given.
+        """
+        search_terms = params.get("query")
+        if search_terms:
+            return await self.interface.playwright_exec("web_search", {"query": search_terms})
+        return {"success": False, "error": "query parameter is required"}
+    async def _action_history_back(self, params: dict) -> dict:
+        """
+        Go back one page by sending the Alt+ArrowLeft hotkey.
+        ``params`` is accepted for a uniform handler signature and is not read.
+        A failing hotkey call is reported as an error dict.
+        """
+        try:
+            await self.interface.interface.hotkey("Alt", "ArrowLeft")
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+        return {"success": True, "message": "Navigated back in history"}
+    async def _action_pause_and_memorize_fact(self, params: dict) -> dict:
+        """
+        Append ``params["fact"]`` to the tool's list of memorized facts.
+        Returns a success dict with the running total, or an error dict when no
+        fact is given.
+        """
+        new_fact = params.get("fact")
+        if not new_fact:
+            return {"success": False, "error": "fact parameter is required"}
+        self._facts.append(new_fact)
+        summary = {"success": True, "message": f"Memorized fact: {new_fact}"}
+        summary["total_facts"] = len(self._facts)
+        return summary
+    async def _action_wait(self, params: dict) -> dict:
+        """
+        Sleep for ``params["time"]`` seconds without blocking the event loop.
+        A missing, zero or negative duration yields an error dict.
+        """
+        seconds = params.get("time", 0)
+        if seconds > 0:
+            await asyncio.sleep(seconds)
+            return {"success": True, "message": f"Waited {seconds} seconds"}
+        return {"success": False, "error": "time parameter must be positive"}
+    async def _action_terminate(self, params: dict) -> dict:
+        """
+        Report task termination with ``params["status"]`` (default "success").
+        The message lists the memorized facts on a second line when any exist.
+        """
+        status = params.get("status", "success")
+        message_lines = [f"Task terminated with status: {status}"]
+        if self._facts:
+            message_lines.append(f"Memorized facts: {self._facts}")
+        return {
+            "success": True,
+            "status": status,
+            "message": "\n".join(message_lines),
+            "terminated": True,
+        }
+    # Legacy methods for backward compatibility
+    async def visit_url(self, url: str) -> dict:
+        """Legacy wrapper: navigate to ``url`` through the visit_url action."""
+        request = {"url": url}
+        return await self._action_visit_url(request)
+    async def click(self, x: int, y: int) -> dict:
+        """Legacy wrapper: left-click at (x, y) through the left_click action."""
+        request = {"coordinate": [x, y]}
+        return await self._action_left_click(request)
+    async def type(self, text: str) -> dict:
+        """Legacy wrapper: type ``text`` through the type action, without clicking first."""
+        request = {"text": text}
+        return await self._action_type(request)
+    async def scroll(self, delta_x: int, delta_y: int) -> dict:
+        """
+        Legacy wrapper: scroll the page by the given deltas.
+        Args:
+            delta_x: Horizontal scroll delta
+            delta_y: Vertical scroll delta
+        Returns:
+            The playwright_exec result, or an error dict when both deltas are zero
+        """
+        if delta_x == 0:
+            # Vertical-only: same path as the scroll action, whose ``pixels``
+            # argument has the opposite sign of delta_y.
+            return await self._action_scroll({"pixels": -delta_y})
+        # The scroll action always sends delta_x=0, so a horizontal component has
+        # to be issued directly or it would be silently dropped.
+        return await self.interface.playwright_exec(
+            "scroll", {"delta_x": delta_x, "delta_y": delta_y}
+        )
+    async def web_search(self, query: str) -> dict:
+        """Legacy wrapper: run a web search for ``query`` through the web_search action."""
+        request = {"query": query}
+        return await self._action_web_search(request)
+    async def screenshot(self) -> str:
+        """
+        Take a screenshot of the current browser page.
+        Returns:
+            The ``screenshot`` value from the playwright_exec result
+        Raises:
+            RuntimeError: if the result is unsuccessful or carries no screenshot
+        """
+        result = await self.interface.playwright_exec("screenshot", {})
+        if not (result.get("success") and result.get("screenshot")):
+            reason = result.get("error", "Unknown error")
+            raise RuntimeError(f"Failed to take screenshot: {reason}")
+        return result["screenshot"]
+    async def get_current_url(self) -> str:
+        """
+        Get the current URL of the browser page.
+        Returns:
+            The ``url`` value from the playwright_exec result
+        Raises:
+            RuntimeError: if the result is unsuccessful or carries no URL
+        """
+        result = await self.interface.playwright_exec("get_current_url", {})
+        if not (result.get("success") and result.get("url")):
+            reason = result.get("error", "Unknown error")
+            raise RuntimeError(f"Failed to get current URL: {reason}")
+        return result["url"]

agent/types.py CHANGED Viewed

@@ -2,37 +2,43 @@
 Type definitions for agent
 """
-from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
-from pydantic import BaseModel
 import re
-from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
 from collections.abc import Iterable
+from typing import Any, Callable, Dict, List, Literal, Optional, Protocol
+from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
+from pydantic import BaseModel
 # Agent input types
 Messages = str | ResponseInputParam | List[Dict[str, Any]]
 Tools = Optional[Iterable[ToolParam]]
 # Agent output types
-AgentResponse = ResponsesAPIResponse
+AgentResponse = ResponsesAPIResponse
 AgentCapability = Literal["step", "click"]
 # Exception types
 class ToolError(RuntimeError):
     """Base exception for tool-related errors"""
     pass
 class IllegalArgumentError(ToolError):
     """Exception raised when function arguments are invalid"""
     pass
 # Agent config registration
 class AgentConfigInfo(BaseModel):
     """Information about a registered agent config"""
     agent_class: type
     models_regex: str
     priority: int = 0
     def matches_model(self, model: str) -> bool:
         """Check if this agent config matches the given model"""
         return bool(re.match(self.models_regex, model))

agent/ui/__init__.py CHANGED Viewed

@@ -2,6 +2,6 @@
 UI components for agent
 """
-from .gradio import launch_ui, create_gradio_ui
+from .gradio import create_gradio_ui, launch_ui
 __all__ = ["launch_ui", "create_gradio_ui"]

agent/ui/__main__.py CHANGED Viewed

@@ -1,4 +1,4 @@
 from .gradio import launch_ui
 if __name__ == "__main__":
-    launch_ui()
+    launch_ui()

agent/ui/gradio/app.py CHANGED Viewed

@@ -6,9 +6,9 @@ with an advanced UI for model selection and configuration.
 Supported Agent Models:
 - OpenAI: openai/computer-use-preview
-- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
+- Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
 - UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
-- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
+- Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3
 Requirements:
     - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
@@ -18,21 +18,21 @@ Requirements:
     - OpenAI or Anthropic API key
 """
-import os
 import asyncio
-import logging
 import json
+import logging
+import os
 import platform
 from pathlib import Path
-from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union, cast
 import gradio as gr
-from gradio.components.chatbot import MetadataDict
-from typing import cast
 # Import from agent package
 from agent import ComputerAgent
-from agent.types import Messages, AgentResponse
+from agent.types import AgentResponse, Messages
 from computer import Computer
+from gradio.components.chatbot import MetadataDict
 # Global variables
 global_agent = None
@@ -42,11 +42,13 @@ SETTINGS_FILE = Path(".gradio_settings.json")
 logging.basicConfig(level=logging.INFO)
 import dotenv
 if dotenv.load_dotenv():
     print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
 else:
     print("DEBUG - No .env file found")
 # --- Settings Load/Save Functions ---
 def load_settings() -> Dict[str, Any]:
     """Loads settings from the JSON file."""
@@ -84,7 +86,7 @@ def save_settings(settings: Dict[str, Any]):
 #     async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
 #         """Add screenshot to chatbot when a screenshot is taken."""
 #         image_markdown = f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})"
 #         if self.chatbot_history is not None:
 #             self.chatbot_history.append(
 #                 gr.ChatMessage(
@@ -114,14 +116,12 @@ MODEL_MAPPINGS = {
         "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
         "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
         "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
-        "Anthropic: Claude 3.5 Sonnet (20241022)": "anthropic/claude-3-5-sonnet-20241022",
     },
     "omni": {
         "default": "omniparser+openai/gpt-4o",
         "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
         "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
         "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
-        "OMNI: Claude 3.5 Sonnet (20241022)": "omniparser+anthropic/claude-3-5-sonnet-20241022",
     },
     "uitars": {
         "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
@@ -141,7 +141,7 @@ def get_model_string(model_name: str, loop_provider: str) -> str:
             ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
             return f"omniparser+ollama_chat/{ollama_model}"
         return "omniparser+ollama_chat/llama3"
     # Map based on loop provider
     mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
     return mapping.get(model_name, mapping["default"])
@@ -151,6 +151,7 @@ def get_ollama_models() -> List[str]:
     """Get available models from Ollama if installed."""
     try:
         import subprocess
         result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
         if result.returncode == 0:
             lines = result.stdout.strip().split("\n")
@@ -174,16 +175,14 @@ def create_computer_instance(
     os_type: str = "macos",
     provider_type: str = "lume",
     name: Optional[str] = None,
-    api_key: Optional[str] = None
+    api_key: Optional[str] = None,
 ) -> Computer:
     """Create or get the global Computer instance."""
     global global_computer
     if global_computer is None:
         if provider_type == "localhost":
             global_computer = Computer(
-                verbosity=verbosity,
-                os_type=os_type,
-                use_host_computer_server=True
+                verbosity=verbosity, os_type=os_type, use_host_computer_server=True
             )
         else:
             global_computer = Computer(
@@ -191,7 +190,7 @@ def create_computer_instance(
                 os_type=os_type,
                 provider_type=provider_type,
                 name=name if name else "",
-                api_key=api_key
+                api_key=api_key,
             )
     return global_computer
@@ -217,7 +216,7 @@ def create_agent(
         os_type=computer_os,
         provider_type=computer_provider,
         name=computer_name,
-        api_key=computer_api_key
+        api_key=computer_api_key,
     )
     # Handle custom models
@@ -233,12 +232,15 @@ def create_agent(
         "only_n_most_recent_images": only_n_most_recent_images,
         "verbosity": verbosity,
     }
     if save_trajectory:
         agent_kwargs["trajectory_dir"] = "trajectories"
     if max_trajectory_budget:
-        agent_kwargs["max_trajectory_budget"] = {"max_budget": max_trajectory_budget, "raise_error": True}
+        agent_kwargs["max_trajectory_budget"] = {
+            "max_budget": max_trajectory_budget,
+            "raise_error": True,
+        }
     global_agent = ComputerAgent(**agent_kwargs)
     return global_agent
@@ -247,7 +249,8 @@ def create_agent(
 def launch_ui():
     """Standalone function to launch the Gradio app."""
     from agent.ui.gradio.ui_components import create_gradio_ui
-    print(f"Starting Gradio app for CUA Agent...")
+    print("Starting Gradio app for Cua Agent...")
     demo = create_gradio_ui()
     demo.launch(share=False, inbrowser=True)

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl