PyPI - cua-agent - Versions diffs - 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl - Mend

cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (61)

agent/__init__.py +4 -10
agent/__main__.py +2 -1
agent/adapters/huggingfacelocal_adapter.py +54 -61
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +110 -99
agent/adapters/models/__init__.py +14 -6
agent/adapters/models/generic.py +7 -4
agent/adapters/models/internvl.py +66 -30
agent/adapters/models/opencua.py +23 -8
agent/adapters/models/qwen2_5_vl.py +7 -4
agent/agent.py +184 -158
agent/callbacks/__init__.py +4 -4
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +18 -13
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +3 -1
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/telemetry.py +67 -61
agent/callbacks/trajectory_saver.py +90 -70
agent/cli.py +115 -110
agent/computers/__init__.py +13 -8
agent/computers/base.py +32 -19
agent/computers/cua.py +33 -25
agent/computers/custom.py +78 -71
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +235 -185
agent/integrations/hud/__init__.py +15 -21
agent/integrations/hud/agent.py +101 -83
agent/integrations/hud/proxy.py +90 -57
agent/loops/__init__.py +25 -21
agent/loops/anthropic.py +537 -483
agent/loops/base.py +13 -14
agent/loops/composed_grounded.py +135 -149
agent/loops/gemini.py +31 -12
agent/loops/glm45v.py +135 -133
agent/loops/gta1.py +47 -50
agent/loops/holo.py +4 -2
agent/loops/internvl.py +6 -11
agent/loops/moondream3.py +36 -12
agent/loops/omniparser.py +215 -210
agent/loops/openai.py +49 -50
agent/loops/opencua.py +29 -41
agent/loops/qwen.py +510 -0
agent/loops/uitars.py +237 -202
agent/proxy/examples.py +54 -50
agent/proxy/handlers.py +27 -34
agent/responses.py +330 -330
agent/types.py +11 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +23 -18
agent/ui/gradio/ui_components.py +310 -161
{cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
cua_agent-0.4.36.dist-info/RECORD +64 -0
cua_agent-0.4.34.dist-info/RECORD +0 -63
{cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
{cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0

agent/cli.py CHANGED Viewed

@@ -3,7 +3,7 @@ CLI chat interface for agent - Computer Use Agent
 Usage:
     python -m agent.cli <model_string>
 Examples:
     python -m agent.cli openai/computer-use-preview
     python -m agent.cli anthropic/claude-3-5-sonnet-20241022
@@ -11,19 +11,22 @@ Examples:
 """
 try:
-    import asyncio
     import argparse
+    import asyncio
+    import base64
+    import json
     import os
+    import platform
     import sys
-    import json
-    from typing import List, Dict, Any
-    import dotenv
-    import base64
     import time
-    import platform
     from pathlib import Path
+    from typing import Any, Dict, List
+    import dotenv
     try:
         from PIL import Image, ImageDraw
         PIL_AVAILABLE = True
     except Exception:
         PIL_AVAILABLE = False
@@ -31,36 +34,44 @@ try:
 except ImportError:
     if __name__ == "__main__":
         raise ImportError(
-            "CLI dependencies not found. "
-            "Please install with: pip install \"cua-agent[cli]\""
+            "CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
         )
 # Load environment variables
 dotenv.load_dotenv()
 # Color codes for terminal output
 class Colors:
-    RESET = '\033[0m'
-    BOLD = '\033[1m'
-    DIM = '\033[2m'
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+    DIM = "\033[2m"
     # Text colors
-    RED = '\033[31m'
-    GREEN = '\033[32m'
-    YELLOW = '\033[33m'
-    BLUE = '\033[34m'
-    MAGENTA = '\033[35m'
-    CYAN = '\033[36m'
-    WHITE = '\033[37m'
-    GRAY = '\033[90m'
-    # Background colors
-    BG_RED = '\033[41m'
-    BG_GREEN = '\033[42m'
-    BG_YELLOW = '\033[43m'
-    BG_BLUE = '\033[44m'
+    RED = "\033[31m"
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    BLUE = "\033[34m"
+    MAGENTA = "\033[35m"
+    CYAN = "\033[36m"
+    WHITE = "\033[37m"
+    GRAY = "\033[90m"
-def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
+    # Background colors
+    BG_RED = "\033[41m"
+    BG_GREEN = "\033[42m"
+    BG_YELLOW = "\033[43m"
+    BG_BLUE = "\033[44m"
+def print_colored(
+    text: str,
+    color: str = "",
+    bold: bool = False,
+    dim: bool = False,
+    end: str = "\n",
+    right: str = "",
+):
     """Print colored text to terminal with optional right-aligned text."""
     prefix = ""
     if bold:
@@ -69,24 +80,25 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
         prefix += Colors.DIM
     if color:
         prefix += color
     if right:
         # Get terminal width (default to 80 if unable to determine)
         try:
             import shutil
             terminal_width = shutil.get_terminal_size().columns
         except:
             terminal_width = 80
         # Add right margin
         terminal_width -= 1
         # Calculate padding needed
         # Account for ANSI escape codes not taking visual space
         visible_left_len = len(text)
         visible_right_len = len(right)
         padding = terminal_width - visible_left_len - visible_right_len
         if padding > 0:
             output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
         else:
@@ -94,7 +106,7 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
             output = f"{prefix}{text} {right}{Colors.RESET}"
     else:
         output = f"{prefix}{text}{Colors.RESET}"
     print(output, end=end)
@@ -113,29 +125,34 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
         args_str = f"('{details['text']}')"
     elif action_type == "scroll" and "x" in details and "y" in details:
         args_str = f"({details['x']}, {details['y']})"
     if total_cost > 0:
         print_colored(f"🛠️  {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
     else:
         print_colored(f"🛠️  {action_type}{args_str}", dim=True)
 def print_welcome(model: str, agent_loop: str, container_name: str):
     """Print welcome message."""
     print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
     print_colored("Type 'exit' to quit.", dim=True)
 async def ainput(prompt: str = ""):
     return await asyncio.to_thread(input, prompt)
-async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
+async def chat_loop(
+    agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
+):
     """Main chat loop with the agent."""
     print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
     history = []
     if initial_prompt:
         history.append({"role": "user", "content": initial_prompt})
     total_cost = 0
     while True:
@@ -143,28 +160,28 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
             # Get user input with prompt
             print_colored("> ", end="")
             user_input = await ainput()
-            if user_input.lower() in ['exit', 'quit', 'q']:
+            if user_input.lower() in ["exit", "quit", "q"]:
                 print_colored("\n👋 Goodbye!")
                 break
             if not user_input:
                 continue
             # Add user message to history
             history.append({"role": "user", "content": user_input})
         # Stream responses from the agent with spinner
         with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
             spinner.hide()
             async for result in agent.run(history):
                 # Add agent responses to history
                 history.extend(result.get("output", []))
                 if show_usage:
                     total_cost += result.get("usage", {}).get("response_cost", 0)
                 # Process and display the output
                 for item in result.get("output", []):
                     if item.get("type") == "message" and item.get("role") == "assistant":
@@ -176,7 +193,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
                                 if text:
                                     spinner.hide()
                                     print_colored(text)
                     elif item.get("type") == "computer_call":
                         # Display computer action
                         action = item.get("action", {})
@@ -186,7 +203,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
                             print_action(action_type, action, total_cost)
                             spinner.text = f"Performing {action_type}..."
                             spinner.show()
                     elif item.get("type") == "function_call":
                         # Display function call
                         function_name = item.get("name", "")
@@ -194,18 +211,18 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
                         print_colored(f"🔧 Calling function: {function_name}", dim=True)
                         spinner.text = f"Calling {function_name}..."
                         spinner.show()
                     elif item.get("type") == "function_call_output":
                         # Display function output (dimmed)
                         output = item.get("output", "")
                         if output and len(output.strip()) > 0:
                             spinner.hide()
                             print_colored(f"📤 {output}", dim=True)
             spinner.hide()
             if show_usage and total_cost > 0:
                 print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
 async def main():
     """Main CLI function."""
@@ -218,90 +235,74 @@ Examples:
   python -m agent.cli anthropic/claude-3-5-sonnet-20241022
   python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
   python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
-        """
+        """,
     )
     parser.add_argument(
         "model",
-        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
+        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')",
     )
     parser.add_argument(
         "--provider",
         choices=["cloud", "lume", "winsandbox", "docker"],
         default="cloud",
-        help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
+        help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
     )
     parser.add_argument(
         "--images",
         type=int,
         default=3,
-        help="Number of recent images to keep in context (default: 3)"
-    )
-    parser.add_argument(
-        "--trajectory",
-        action="store_true",
-        help="Save trajectory for debugging"
-    )
-    parser.add_argument(
-        "--budget",
-        type=float,
-        help="Maximum budget for the session (in dollars)"
-    )
-    parser.add_argument(
-        "--verbose",
-        action="store_true",
-        help="Enable verbose logging"
+        help="Number of recent images to keep in context (default: 3)",
     )
+    parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
+    parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
     parser.add_argument(
-        "-p", "--prompt",
+        "-p",
+        "--prompt",
         type=str,
-        help="Initial prompt to send to the agent. Leave blank for interactive mode."
+        help="Initial prompt to send to the agent. Leave blank for interactive mode.",
     )
     parser.add_argument(
         "--prompt-file",
         type=Path,
-        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
+        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
     )
     parser.add_argument(
         "--predict-click",
         dest="predict_click",
         type=str,
-        help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
+        help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
     )
-    parser.add_argument(
-        "-c", "--cache",
-        action="store_true",
-        help="Tell the API to enable caching"
-    )
+    parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
     parser.add_argument(
-        "-u", "--usage",
-        action="store_true",
-        help="Show total cost of the agent runs"
+        "-u", "--usage", action="store_true", help="Show total cost of the agent runs"
     )
     parser.add_argument(
-        "-r", "--max-retries",
+        "-r",
+        "--max-retries",
         type=int,
         default=3,
-        help="Maximum number of retries for the LLM API calls"
+        help="Maximum number of retries for the LLM API calls",
     )
     args = parser.parse_args()
     # Check for required environment variables
     container_name = os.getenv("CUA_CONTAINER_NAME")
     cua_api_key = os.getenv("CUA_API_KEY")
     # Prompt for missing environment variables (container name always required)
     if not container_name:
         if args.provider == "cloud":
@@ -321,13 +322,13 @@ Examples:
         if not cua_api_key:
             print_colored("❌ API key is required for cloud provider.")
             sys.exit(1)
     # Check for provider-specific API keys based on model
     provider_api_keys = {
         "openai/": "OPENAI_API_KEY",
         "anthropic/": "ANTHROPIC_API_KEY",
     }
     # Find matching provider and check for API key
     for prefix, env_var in provider_api_keys.items():
         if prefix in args.model:
@@ -340,7 +341,7 @@ Examples:
                 # Set the environment variable for the session
                 os.environ[env_var] = api_key
             break
     # Import here to avoid import errors if dependencies are missing
     try:
         from agent import ComputerAgent
@@ -349,7 +350,7 @@ Examples:
         print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
         print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
         sys.exit(1)
     # Resolve provider -> os_type, provider_type, api key requirement
     provider_map = {
         "cloud": ("linux", "cloud", True),
@@ -365,42 +366,46 @@ Examples:
         "name": container_name,
     }
     if needs_api_key:
-        computer_kwargs["api_key"] = cua_api_key # type: ignore
+        computer_kwargs["api_key"] = cua_api_key  # type: ignore
     # Create computer instance
-    async with Computer(**computer_kwargs) as computer: # type: ignore
+    async with Computer(**computer_kwargs) as computer:  # type: ignore
         # Create agent
         agent_kwargs = {
             "model": args.model,
             "tools": [computer],
-            "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
+            "trust_remote_code": True,  # needed for some local models (e.g., InternVL, OpenCUA)
             "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
-            "max_retries": args.max_retries
+            "max_retries": args.max_retries,
         }
         if args.images > 0:
             agent_kwargs["only_n_most_recent_images"] = args.images
         if args.trajectory:
             agent_kwargs["trajectory_dir"] = "trajectories"
         if args.budget:
             agent_kwargs["max_trajectory_budget"] = {
                 "max_budget": args.budget,
                 "raise_error": True,
-                "reset_after_each_run": False
+                "reset_after_each_run": False,
             }
         if args.cache:
             agent_kwargs["use_prompt_caching"] = True
         agent = ComputerAgent(**agent_kwargs)
         # If predict-click mode is requested, run once and exit
         if args.predict_click:
             if not PIL_AVAILABLE:
-                print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
+                print_colored(
+                    "❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
+                    Colors.RED,
+                    bold=True,
+                )
                 sys.exit(1)
             instruction = args.predict_click
@@ -435,6 +440,7 @@ Examples:
             try:
                 from io import BytesIO
                 with Image.open(BytesIO(img_bytes)) as img:
                     img = img.convert("RGB")
                     draw = ImageDraw.Draw(img)
@@ -457,9 +463,9 @@ Examples:
                         if system == "windows":
                             os.startfile(str(out_path))  # type: ignore[attr-defined]
                         elif system == "darwin":
-                            os.system(f"open \"{out_path}\"")
+                            os.system(f'open "{out_path}"')
                         else:
-                            os.system(f"xdg-open \"{out_path}\"")
+                            os.system(f'xdg-open "{out_path}"')
                     except Exception:
                         pass
             except Exception as e:
@@ -482,9 +488,8 @@ Examples:
         await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
 if __name__ == "__main__":
     try:
         asyncio.run(main())
     except (KeyboardInterrupt, EOFError) as _:
-        print_colored("\n\n👋 Goodbye!")
+        print_colored("\n\n👋 Goodbye!")

agent/computers/__init__.py CHANGED Viewed

@@ -6,27 +6,32 @@ computer interface types, supporting both the ComputerHandler protocol and the
 Computer library interface.
 """
+from computer import Computer as cuaComputer
 from .base import AsyncComputerHandler
 from .cua import cuaComputerHandler
 from .custom import CustomComputerHandler
-from computer import Computer as cuaComputer
 def is_agent_computer(computer):
     """Check if the given computer is a ComputerHandler or CUA Computer."""
-    return isinstance(computer, AsyncComputerHandler) or \
-        isinstance(computer, cuaComputer) or \
-        (isinstance(computer, dict)) #and "screenshot" in computer)
+    return (
+        isinstance(computer, AsyncComputerHandler)
+        or isinstance(computer, cuaComputer)
+        or (isinstance(computer, dict))
+    )  # and "screenshot" in computer)
 async def make_computer_handler(computer):
     """
     Create a computer handler from a computer interface.
     Args:
         computer: Either a ComputerHandler instance, Computer instance, or dict of functions
     Returns:
         ComputerHandler: A computer handler instance
     Raises:
         ValueError: If the computer type is not supported
     """
@@ -38,4 +43,4 @@ async def make_computer_handler(computer):
         return computer_handler
     if isinstance(computer, dict):
         return CustomComputerHandler(computer)
-    raise ValueError(f"Unsupported computer type: {type(computer)}")
+    raise ValueError(f"Unsupported computer type: {type(computer)}")

agent/computers/base.py CHANGED Viewed

@@ -2,69 +2,82 @@
 Base computer interface protocol for agent interactions.
 """
-from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    Union,
+    runtime_checkable,
+)
 @runtime_checkable
 class AsyncComputerHandler(Protocol):
     """Protocol defining the interface for computer interactions."""
-    # ==== Computer-Use-Preview Action Space ====
+    # ==== Computer-Use-Preview Action Space ====
     async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
         """Get the current environment type."""
         ...
     async def get_dimensions(self) -> tuple[int, int]:
         """Get screen dimensions as (width, height)."""
         ...
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         ...
     async def click(self, x: int, y: int, button: str = "left") -> None:
         """Click at coordinates with specified button."""
         ...
     async def double_click(self, x: int, y: int) -> None:
         """Double click at coordinates."""
         ...
     async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
         """Scroll at coordinates with specified scroll amounts."""
         ...
     async def type(self, text: str) -> None:
         """Type text."""
         ...
     async def wait(self, ms: int = 1000) -> None:
         """Wait for specified milliseconds."""
         ...
     async def move(self, x: int, y: int) -> None:
         """Move cursor to coordinates."""
         ...
     async def keypress(self, keys: Union[List[str], str]) -> None:
         """Press key combination."""
         ...
     async def drag(self, path: List[Dict[str, int]]) -> None:
         """Drag along specified path."""
         ...
     async def get_current_url(self) -> str:
         """Get current URL (for browser environments)."""
         ...
-    # ==== Anthropic Action Space ====
+    # ==== Anthropic Action Space ====
     async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse down at coordinates."""
         ...
     async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse up at coordinates."""
         ...

cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl