cua-agent 0.4.31__py3-none-any.whl → 0.4.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of cua-agent has been flagged as potentially problematic.

@@ -188,6 +188,8 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
         if "+" in model:
             model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
+        # strip non-alphanumeric characters from model_name_short
+        model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
 
         # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
         now = datetime.now()
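
For illustration, a minimal sketch of what the added sanitization does to the short model name used in trajectory ids; the composed model string below is hypothetical, not taken from the package:

# Illustrative only: reproduces the logic from the hunk above on a hypothetical model string.
model = "omniparser+anthropic/claude-3-5-sonnet-20241022"
model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]  # "claude-3-5-sonne"
if "+" in model:
    model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short  # "omni_claude-3-5-sonne"
# New in 0.4.33: keep only alphanumerics and underscores.
model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
print(model_name_short)  # omni_claude35sonne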
agent/cli.py CHANGED
@@ -18,6 +18,15 @@ try:
     import json
     from typing import List, Dict, Any
     import dotenv
+    import base64
+    import time
+    import platform
+    from pathlib import Path
+    try:
+        from PIL import Image, ImageDraw
+        PIL_AVAILABLE = True
+    except Exception:
+        PIL_AVAILABLE = False
     from yaspin import yaspin
 except ImportError:
     if __name__ == "__main__":
@@ -158,7 +167,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
 
         # Process and display the output
         for item in result.get("output", []):
-            if item.get("type") == "message":
+            if item.get("type") == "message" and item.get("role") == "assistant":
                 # Display agent text response
                 content = item.get("content", [])
                 for content_part in content:
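
The tightened condition above only prints assistant messages. A minimal sketch of the kind of item it matches; the item shape is assumed from the keys read in chat_loop and is not verified against the agent library:

# Assumed item shape (illustrative); only the keys checked by chat_loop matter here.
item = {
    "type": "message",
    "role": "assistant",
    "content": [{"type": "output_text", "text": "Opened the browser and navigated to the page."}],
}
if item.get("type") == "message" and item.get("role") == "assistant":
    for content_part in item.get("content", []):
        print(content_part.get("text", ""))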
@@ -217,6 +226,13 @@ Examples:
         help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
     )
 
+    parser.add_argument(
+        "--provider",
+        choices=["cloud", "lume", "winsandbox", "docker"],
+        default="cloud",
+        help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
+    )
+
     parser.add_argument(
         "--images",
         type=int,
@@ -248,6 +264,19 @@ Examples:
         help="Initial prompt to send to the agent. Leave blank for interactive mode."
     )
 
+    parser.add_argument(
+        "--prompt-file",
+        type=Path,
+        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
+    )
+
+    parser.add_argument(
+        "--predict-click",
+        dest="predict_click",
+        type=str,
+        help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
+    )
+
     parser.add_argument(
         "-c", "--cache",
         action="store_true",
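
A self-contained sketch of how the new options parse; this mirrors the argument definitions above but is not the package's actual parser, and the example values are illustrative:

# Minimal stand-in parser mirroring the new flags (not the package's real CLI object).
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("--provider", choices=["cloud", "lume", "winsandbox", "docker"], default="cloud")
parser.add_argument("--prompt-file", type=Path)
parser.add_argument("--predict-click", dest="predict_click", type=str)

args = parser.parse_args(["--provider", "docker", "--prompt-file", "task.txt"])
print(args.provider, args.prompt_file, args.predict_click)  # docker task.txt None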
@@ -273,33 +302,35 @@ Examples:
     container_name = os.getenv("CUA_CONTAINER_NAME")
     cua_api_key = os.getenv("CUA_API_KEY")
 
-    # Prompt for missing environment variables
+    # Prompt for missing environment variables (container name always required)
     if not container_name:
-        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
-        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
-        container_name = input("Enter your CUA container name: ").strip()
-        if not container_name:
-            print_colored("❌ Container name is required.")
-            sys.exit(1)
-
-    if not cua_api_key:
+        if args.provider == "cloud":
+            print_colored("CUA_CONTAINER_NAME not set.", dim=True)
+            print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
+            container_name = input("Enter your CUA container name: ").strip()
+            if not container_name:
+                print_colored("❌ Container name is required.")
+                sys.exit(1)
+        else:
+            container_name = "cli-sandbox"
+
+    # Only require API key for cloud provider
+    if args.provider == "cloud" and not cua_api_key:
         print_colored("CUA_API_KEY not set.", dim=True)
         cua_api_key = input("Enter your CUA API key: ").strip()
         if not cua_api_key:
-            print_colored("❌ API key is required.")
+            print_colored("❌ API key is required for cloud provider.")
             sys.exit(1)
 
     # Check for provider-specific API keys based on model
     provider_api_keys = {
         "openai/": "OPENAI_API_KEY",
         "anthropic/": "ANTHROPIC_API_KEY",
-        "omniparser+": "OPENAI_API_KEY",
-        "omniparser+": "ANTHROPIC_API_KEY",
     }
 
     # Find matching provider and check for API key
     for prefix, env_var in provider_api_keys.items():
-        if args.model.startswith(prefix):
+        if prefix in args.model:
             if not os.getenv(env_var):
                 print_colored(f"{env_var} not set.", dim=True)
                 api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
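
Two details in this hunk benefit from a worked example. The removed lines were a latent bug, since a Python dict literal keeps only the last value for a duplicated key, and the switch from startswith to a substring test lets composed model strings match a provider anywhere in the string. A minimal sketch with an illustrative model string:

# Duplicate keys in a dict literal: only the last assignment survives.
provider_api_keys = {"omniparser+": "OPENAI_API_KEY", "omniparser+": "ANTHROPIC_API_KEY"}
print(provider_api_keys)  # {'omniparser+': 'ANTHROPIC_API_KEY'}

# Prefix test vs. substring test on a composed model string (illustrative value).
model = "omniparser+anthropic/claude-3-5-sonnet-20241022"
print(model.startswith("anthropic/"))  # False - the old check would skip the Anthropic key prompt
print("anthropic/" in model)           # True  - the new check catches it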
@@ -319,18 +350,31 @@ Examples:
         print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
         sys.exit(1)
 
+    # Resolve provider -> os_type, provider_type, api key requirement
+    provider_map = {
+        "cloud": ("linux", "cloud", True),
+        "lume": ("macos", "lume", False),
+        "winsandbox": ("windows", "winsandbox", False),
+        "docker": ("linux", "docker", False),
+    }
+    os_type, provider_type, needs_api_key = provider_map[args.provider]
+
+    computer_kwargs = {
+        "os_type": os_type,
+        "provider_type": provider_type,
+        "name": container_name,
+    }
+    if needs_api_key:
+        computer_kwargs["api_key"] = cua_api_key  # type: ignore
+
     # Create computer instance
-    async with Computer(
-        os_type="linux",
-        provider_type="cloud",
-        name=container_name,
-        api_key=cua_api_key
-    ) as computer:
+    async with Computer(**computer_kwargs) as computer:  # type: ignore
 
         # Create agent
         agent_kwargs = {
             "model": args.model,
             "tools": [computer],
+            "trust_remote_code": True,  # needed for some local models (e.g., InternVL, OpenCUA)
             "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
             "max_retries": args.max_retries
         }
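
A short sketch of what the new provider mapping resolves to for a non-cloud provider; it copies the table above and reuses the fallback container name, but it is only illustrative:

# Mirrors the provider table above; non-cloud providers skip the CUA API key.
provider_map = {
    "cloud": ("linux", "cloud", True),
    "lume": ("macos", "lume", False),
    "winsandbox": ("windows", "winsandbox", False),
    "docker": ("linux", "docker", False),
}
os_type, provider_type, needs_api_key = provider_map["lume"]
computer_kwargs = {"os_type": os_type, "provider_type": provider_type, "name": "cli-sandbox"}
if needs_api_key:
    computer_kwargs["api_key"] = "<CUA_API_KEY>"  # only reached for the cloud provider
print(computer_kwargs)  # {'os_type': 'macos', 'provider_type': 'lume', 'name': 'cli-sandbox'}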
@@ -353,8 +397,89 @@ Examples:
 
         agent = ComputerAgent(**agent_kwargs)
 
-        # Start chat loop
-        await chat_loop(agent, args.model, container_name, args.prompt, args.usage)
+        # If predict-click mode is requested, run once and exit
+        if args.predict_click:
+            if not PIL_AVAILABLE:
+                print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
+                sys.exit(1)
+
+            instruction = args.predict_click
+            print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
+
+            # Take a fresh screenshot FIRST
+            try:
+                img_bytes = await computer.interface.screenshot()
+            except Exception as e:
+                print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            # Encode screenshot to base64 for predict_click
+            try:
+                image_b64 = base64.b64encode(img_bytes).decode("utf-8")
+            except Exception as e:
+                print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            try:
+                coords = await agent.predict_click(instruction, image_b64=image_b64)
+            except Exception as e:
+                print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            if not coords:
+                print_colored("⚠️ No coordinates returned.", Colors.YELLOW)
+                sys.exit(2)
+
+            x, y = coords
+            print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
+
+            try:
+                from io import BytesIO
+                with Image.open(BytesIO(img_bytes)) as img:
+                    img = img.convert("RGB")
+                    draw = ImageDraw.Draw(img)
+                    # Draw crosshair
+                    size = 12
+                    color = (255, 0, 0)
+                    draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
+                    draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
+                    # Optional small circle
+                    r = 6
+                    draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
+
+                    out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
+                    img.save(out_path)
+                    print_colored(f"🖼️ Saved to {out_path}")
+
+                    # Open the image with default viewer
+                    try:
+                        system = platform.system().lower()
+                        if system == "windows":
+                            os.startfile(str(out_path))  # type: ignore[attr-defined]
+                        elif system == "darwin":
+                            os.system(f"open \"{out_path}\"")
+                        else:
+                            os.system(f"xdg-open \"{out_path}\"")
+                    except Exception:
+                        pass
+            except Exception as e:
+                print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+            # Done
+            sys.exit(0)
+
+        # Resolve initial prompt from --prompt-file or --prompt
+        initial_prompt = args.prompt or ""
+        if args.prompt_file:
+            try:
+                initial_prompt = args.prompt_file.read_text(encoding="utf-8")
+            except Exception as e:
+                print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
+                sys.exit(1)
+
+        # Start chat loop (default interactive mode)
+        await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
 
 
 
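The crosshair rendering in the block above only depends on Pillow; here is a stripped-down, standalone sketch of that drawing step with a synthetic image and coordinates, purely for illustration:

# Standalone sketch of the --predict-click crosshair drawing (synthetic inputs).
from PIL import Image, ImageDraw

img = Image.new("RGB", (640, 400), (255, 255, 255))  # stand-in for the screenshot
x, y = 320, 200                                      # stand-in for the predicted coordinates
draw = ImageDraw.Draw(img)
size, r, color = 12, 6, (255, 0, 0)
draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
img.save("predict_click_example.png")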
agent/loops/__init__.py CHANGED
@@ -10,5 +10,23 @@ from . import omniparser
 from . import gta1
 from . import composed_grounded
 from . import glm45v
+from . import opencua
+from . import internvl
+from . import holo
+from . import moondream3
+from . import gemini
 
-__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"]
+__all__ = [
+    "anthropic",
+    "openai",
+    "uitars",
+    "omniparser",
+    "gta1",
+    "composed_grounded",
+    "glm45v",
+    "opencua",
+    "internvl",
+    "holo",
+    "moondream3",
+    "gemini"
+]
agent/loops/anthropic.py CHANGED
@@ -33,7 +33,7 @@ from ..responses import (
 MODEL_TOOL_MAPPING = [
     # Claude 4 models
     {
-        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
+        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
         "tool_version": "computer_20250124",
         "beta_flag": "computer-use-2025-01-24"
     },
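
A quick check of the widened regular expression itself (how the loop applies it is not shown in this diff, and the model id strings are illustrative):

import re

# Pattern from the hunk above; claude-haiku-4 ids now map to the computer_20250124 tool entry.
pattern = r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4"
print(bool(re.search(pattern, "claude-haiku-4-20250101")))    # True (illustrative id)
print(bool(re.search(pattern, "claude-3-5-sonnet-20241022"))) # False (not a Claude 4 family id)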
@@ -1577,11 +1577,10 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
                     isinstance(item.get("action"), dict)):
 
                     action = item["action"]
-                    if action.get("type") == "click":
+                    if action.get("x") and action.get("y"):
                         x = action.get("x")
                         y = action.get("y")
-                        if x is not None and y is not None:
-                            return (int(x), int(y))
+                        return (int(x), int(y))
 
         return None
 
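A minimal sketch of the coordinate extraction after this change; the item dict is an assumed shape, and only the keys the code actually reads are meaningful:

# Assumed action item shape (illustrative); the new condition accepts any action that carries coordinates.
item = {"type": "tool_use", "action": {"type": "left_click", "x": 412, "y": 233}}
if isinstance(item.get("action"), dict):
    action = item["action"]
    if action.get("x") and action.get("y"):  # truthiness check: an x or y of 0 would be skipped
        print((int(action["x"]), int(action["y"])))  # (412, 233)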
agent/loops/composed_grounded.py CHANGED
@@ -126,7 +126,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.