cua-agent 0.4.32__py3-none-any.whl → 0.4.33__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Potentially problematic release: this version of cua-agent might be problematic.

agent/cli.py CHANGED
@@ -167,7 +167,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str

         # Process and display the output
         for item in result.get("output", []):
-            if item.get("type") == "message":
+            if item.get("type") == "message" and item.get("role") == "assistant":
                 # Display agent text response
                 content = item.get("content", [])
                 for content_part in content:
@@ -226,6 +226,13 @@ Examples:
         help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
     )

+    parser.add_argument(
+        "--provider",
+        choices=["cloud", "lume", "winsandbox", "docker"],
+        default="cloud",
+        help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
+    )
+
     parser.add_argument(
         "--images",
         type=int,
@@ -257,6 +264,12 @@ Examples:
         help="Initial prompt to send to the agent. Leave blank for interactive mode."
     )

+    parser.add_argument(
+        "--prompt-file",
+        type=Path,
+        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
+    )
+
     parser.add_argument(
         "--predict-click",
         dest="predict_click",
@@ -289,33 +302,35 @@ Examples:
     container_name = os.getenv("CUA_CONTAINER_NAME")
     cua_api_key = os.getenv("CUA_API_KEY")

-    # Prompt for missing environment variables
+    # Prompt for missing environment variables (container name always required)
     if not container_name:
-        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
-        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
-        container_name = input("Enter your CUA container name: ").strip()
-        if not container_name:
-            print_colored("❌ Container name is required.")
-            sys.exit(1)
-
-    if not cua_api_key:
+        if args.provider == "cloud":
+            print_colored("CUA_CONTAINER_NAME not set.", dim=True)
+            print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
+            container_name = input("Enter your CUA container name: ").strip()
+            if not container_name:
+                print_colored("❌ Container name is required.")
+                sys.exit(1)
+        else:
+            container_name = "cli-sandbox"
+
+    # Only require API key for cloud provider
+    if args.provider == "cloud" and not cua_api_key:
         print_colored("CUA_API_KEY not set.", dim=True)
         cua_api_key = input("Enter your CUA API key: ").strip()
         if not cua_api_key:
-            print_colored("❌ API key is required.")
+            print_colored("❌ API key is required for cloud provider.")
             sys.exit(1)

     # Check for provider-specific API keys based on model
     provider_api_keys = {
         "openai/": "OPENAI_API_KEY",
         "anthropic/": "ANTHROPIC_API_KEY",
-        "omniparser+": "OPENAI_API_KEY",
-        "omniparser+": "ANTHROPIC_API_KEY",
     }

     # Find matching provider and check for API key
     for prefix, env_var in provider_api_keys.items():
-        if args.model.startswith(prefix):
+        if prefix in args.model:
             if not os.getenv(env_var):
                 print_colored(f"{env_var} not set.", dim=True)
                 api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
@@ -335,13 +350,25 @@ Examples:
         print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
         sys.exit(1)

+    # Resolve provider -> os_type, provider_type, api key requirement
+    provider_map = {
+        "cloud": ("linux", "cloud", True),
+        "lume": ("macos", "lume", False),
+        "winsandbox": ("windows", "winsandbox", False),
+        "docker": ("linux", "docker", False),
+    }
+    os_type, provider_type, needs_api_key = provider_map[args.provider]
+
+    computer_kwargs = {
+        "os_type": os_type,
+        "provider_type": provider_type,
+        "name": container_name,
+    }
+    if needs_api_key:
+        computer_kwargs["api_key"] = cua_api_key  # type: ignore
+
     # Create computer instance
-    async with Computer(
-        os_type="linux",
-        provider_type="cloud",
-        name=container_name,
-        api_key=cua_api_key
-    ) as computer:
+    async with Computer(**computer_kwargs) as computer:  # type: ignore

         # Create agent
         agent_kwargs = {
@@ -442,8 +469,17 @@ Examples:
         # Done
         sys.exit(0)

+    # Resolve initial prompt from --prompt-file or --prompt
+    initial_prompt = args.prompt or ""
+    if args.prompt_file:
+        try:
+            initial_prompt = args.prompt_file.read_text(encoding="utf-8")
+        except Exception as e:
+            print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
+            sys.exit(1)
+
     # Start chat loop (default interactive mode)
-    await chat_loop(agent, args.model, container_name, args.prompt, args.usage)
+    await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)


agent/loops/__init__.py CHANGED
@@ -13,6 +13,8 @@ from . import glm45v
 from . import opencua
 from . import internvl
 from . import holo
+from . import moondream3
+from . import gemini

 __all__ = [
     "anthropic",
@@ -25,4 +27,6 @@ __all__ = [
     "opencua",
     "internvl",
     "holo",
+    "moondream3",
+    "gemini"
 ]
agent/loops/anthropic.py CHANGED
@@ -33,7 +33,7 @@ from ..responses import (
 MODEL_TOOL_MAPPING = [
     # Claude 4 models
     {
-        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
+        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
         "tool_version": "computer_20250124",
         "beta_flag": "computer-use-2025-01-24"
     },
agent/loops/gemini.py ADDED
@@ -0,0 +1,391 @@
+"""
+Gemini 2.5 Computer Use agent loop
+
+Maps internal Agent SDK message format to Google's Gemini Computer Use API and back.
+
+Key features:
+- Lazy import of google.genai
+- Configure Computer Use tool with excluded browser-specific predefined functions
+- Optional custom function declarations hook for computer-call specific functions
+- Convert Gemini function_call parts into internal computer_call actions
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import uuid
+from typing import Any, Dict, List, Optional, Tuple
+
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability
+
+
+def _lazy_import_genai():
+    """Import google.genai lazily to avoid hard dependency unless used."""
+    try:
+        from google import genai  # type: ignore
+        from google.genai import types  # type: ignore
+        return genai, types
+    except Exception as e:  # pragma: no cover
+        raise RuntimeError(
+            "google.genai is required for the Gemini Computer Use loop. Install the Google Gemini SDK."
+        ) from e
+
+
+def _data_url_to_bytes(data_url: str) -> Tuple[bytes, str]:
+    """Convert a data URL to raw bytes and mime type."""
+    if not data_url.startswith("data:"):
+        # Assume it's a base64 png payload
+        try:
+            return base64.b64decode(data_url), "image/png"
+        except Exception:
+            return b"", "application/octet-stream"
+    header, b64 = data_url.split(",", 1)
+    mime = "image/png"
+    if ";" in header:
+        mime = header.split(";")[0].split(":", 1)[1] or "image/png"
+    return base64.b64decode(b64), mime
+
+
+def _bytes_image_size(img_bytes: bytes) -> Tuple[int, int]:
+    try:
+        img = Image.open(io.BytesIO(img_bytes))
+        return img.size
+    except Exception:
+        return (1024, 768)
+
+
+def _find_last_user_text(messages: List[Dict[str, Any]]) -> List[str]:
+    texts: List[str] = []
+    for msg in reversed(messages):
+        if msg.get("type") in (None, "message") and msg.get("role") == "user":
+            content = msg.get("content")
+            if isinstance(content, str):
+                return [content]
+            elif isinstance(content, list):
+                for c in content:
+                    if c.get("type") in ("input_text", "output_text") and c.get("text"):
+                        texts.append(c["text"])  # newest first
+            if texts:
+                return list(reversed(texts))
+    return []
+
+
+def _find_last_screenshot(messages: List[Dict[str, Any]]) -> Optional[bytes]:
+    for msg in reversed(messages):
+        if msg.get("type") == "computer_call_output":
+            out = msg.get("output", {})
+            if isinstance(out, dict) and out.get("type") in ("input_image", "computer_screenshot"):
+                image_url = out.get("image_url", "")
+                if image_url:
+                    data, _ = _data_url_to_bytes(image_url)
+                    return data
+    return None
+
+
+def _denormalize(v: int, size: int) -> int:
+    # Gemini returns 0-999 normalized
+    try:
+        return max(0, min(size - 1, int(round(v / 1000 * size))))
+    except Exception:
+        return 0
+
+
+def _map_gemini_fc_to_computer_call(
+    fc: Dict[str, Any],
+    screen_w: int,
+    screen_h: int,
+) -> Optional[Dict[str, Any]]:
+    name = fc.get("name")
+    args = fc.get("args", {}) or {}
+
+    action: Dict[str, Any] = {}
+    if name == "click_at":
+        x = _denormalize(int(args.get("x", 0)), screen_w)
+        y = _denormalize(int(args.get("y", 0)), screen_h)
+        action = {"type": "click", "x": x, "y": y, "button": "left"}
+    elif name == "type_text_at":
+        x = _denormalize(int(args.get("x", 0)), screen_w)
+        y = _denormalize(int(args.get("y", 0)), screen_h)
+        text = args.get("text", "")
+        if args.get("press_enter") == True:
+            text += "\n"
+        action = {"type": "type", "x": x, "y": y, "text": text}
+    elif name == "hover_at":
+        x = _denormalize(int(args.get("x", 0)), screen_w)
+        y = _denormalize(int(args.get("y", 0)), screen_h)
+        action = {"type": "move", "x": x, "y": y}
+    elif name == "key_combination":
+        keys = str(args.get("keys", ""))
+        action = {"type": "keypress", "keys": keys}
+    elif name == "scroll_document":
+        direction = args.get("direction", "down")
+        magnitude = 800
+        dx, dy = 0, 0
+        if direction == "down":
+            dy = magnitude
+        elif direction == "up":
+            dy = -magnitude
+        elif direction == "right":
+            dx = magnitude
+        elif direction == "left":
+            dx = -magnitude
+        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": int(screen_w / 2), "y": int(screen_h / 2)}
+    elif name == "scroll_at":
+        x = _denormalize(int(args.get("x", 500)), screen_w)
+        y = _denormalize(int(args.get("y", 500)), screen_h)
+        direction = args.get("direction", "down")
+        magnitude = int(args.get("magnitude", 800))
+        dx, dy = 0, 0
+        if direction == "down":
+            dy = magnitude
+        elif direction == "up":
+            dy = -magnitude
+        elif direction == "right":
+            dx = magnitude
+        elif direction == "left":
+            dx = -magnitude
+        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": x, "y": y}
+    elif name == "drag_and_drop":
+        x = _denormalize(int(args.get("x", 0)), screen_w)
+        y = _denormalize(int(args.get("y", 0)), screen_h)
+        dx = _denormalize(int(args.get("destination_x", x)), screen_w)
+        dy = _denormalize(int(args.get("destination_y", y)), screen_h)
+        action = {"type": "drag", "start_x": x, "start_y": y, "end_x": dx, "end_y": dy, "button": "left"}
+    elif name == "wait_5_seconds":
+        action = {"type": "wait"}
+    else:
+        # Unsupported / excluded browser-specific or custom function; ignore
+        return None
+
+    return {
+        "type": "computer_call",
+        "call_id": uuid.uuid4().hex,
+        "status": "completed",
+        "action": action,
+    }
+
+
+@register_agent(models=r"^gemini-2\.5-computer-use-preview-10-2025$")
+class GeminiComputerUseConfig(AsyncAgentConfig):
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        genai, types = _lazy_import_genai()
+
+        client = genai.Client()
+
+        # Build excluded predefined functions for browser-specific behavior
+        excluded = [
+            "open_web_browser",
+            "search",
+            "navigate",
+            "go_forward",
+            "go_back",
+            "scroll_document",
+        ]
+        # Optional custom functions: can be extended by host code via `tools` parameter later if desired
+        CUSTOM_FUNCTION_DECLARATIONS: List[Any] = []
+
+        # Compose tools config
+        generate_content_config = types.GenerateContentConfig(
+            tools=[
+                types.Tool(
+                    computer_use=types.ComputerUse(
+                        environment=types.Environment.ENVIRONMENT_BROWSER,
+                        excluded_predefined_functions=excluded,
+                    )
+                ),
+                # types.Tool(function_declarations=CUSTOM_FUNCTION_DECLARATIONS),  # enable when custom functions needed
+            ]
+        )
+
+        # Prepare contents: last user text + latest screenshot
+        user_texts = _find_last_user_text(messages)
+        screenshot_bytes = _find_last_screenshot(messages)
+
+        parts: List[Any] = []
+        for t in user_texts:
+            parts.append(types.Part(text=t))
+
+        screen_w, screen_h = 1024, 768
+        if screenshot_bytes:
+            screen_w, screen_h = _bytes_image_size(screenshot_bytes)
+            parts.append(types.Part.from_bytes(data=screenshot_bytes, mime_type="image/png"))
+
+        # If we don't have any content, at least pass an empty user part to prompt reasoning
+        if not parts:
+            parts = [types.Part(text="Proceed to the next action.")]
+
+        contents = [types.Content(role="user", parts=parts)]
+
+        api_kwargs = {
+            "model": model,
+            "contents": contents,
+            "config": generate_content_config,
+        }
+
+        if _on_api_start:
+            await _on_api_start({
+                "model": api_kwargs["model"],
+                # "contents": api_kwargs["contents"],  # Disabled for now
+                "config": api_kwargs["config"],
+            })
+
+        response = client.models.generate_content(**api_kwargs)
+
+        if _on_api_end:
+            await _on_api_end({
+                "model": api_kwargs["model"],
+                # "contents": api_kwargs["contents"],  # Disabled for now
+                "config": api_kwargs["config"],
+            }, response)
+
+        # Usage (Gemini SDK may not always provide token usage; populate when available)
+        usage: Dict[str, Any] = {}
+        try:
+            # Some SDKs expose response.usage; if available, copy
+            if getattr(response, "usage_metadata", None):
+                md = response.usage_metadata
+                usage = {
+                    "prompt_tokens": getattr(md, "prompt_token_count", None) or 0,
+                    "completion_tokens": getattr(md, "candidates_token_count", None) or 0,
+                    "total_tokens": getattr(md, "total_token_count", None) or 0,
+                }
+        except Exception:
+            pass
+
+        if _on_usage and usage:
+            await _on_usage(usage)
+
+        # Parse output into internal items
+        output_items: List[Dict[str, Any]] = []
+
+        candidate = response.candidates[0]
+        # Text parts from the model (assistant message)
+        text_parts: List[str] = []
+        function_calls: List[Dict[str, Any]] = []
+        for p in candidate.content.parts:
+            if getattr(p, "text", None):
+                text_parts.append(p.text)
+            if getattr(p, "function_call", None):
+                # p.function_call has name and args
+                fc = {
+                    "name": getattr(p.function_call, "name", None),
+                    "args": dict(getattr(p.function_call, "args", {}) or {}),
+                }
+                function_calls.append(fc)
+
+        if text_parts:
+            output_items.append(
+                {
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [{"type": "output_text", "text": "\n".join(text_parts)}],
+                }
+            )
+
+        # Map function calls to internal computer_call actions
+        for fc in function_calls:
+            item = _map_gemini_fc_to_computer_call(fc, screen_w, screen_h)
+            if item is not None:
+                output_items.append(item)
+
+        return {"output": output_items, "usage": usage}
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs,
+    ) -> Optional[Tuple[float, float]]:
+        """Ask Gemini CUA to output a single click action for the given instruction.
+
+        Excludes all predefined tools except `click_at` and sends the screenshot.
+        Returns pixel (x, y) if a click is proposed, else None.
+        """
+        genai, types = _lazy_import_genai()
+
+        client = genai.Client()
+
+        # Exclude all but click_at
+        exclude_all_but_click = [
+            "open_web_browser",
+            "wait_5_seconds",
+            "go_back",
+            "go_forward",
+            "search",
+            "navigate",
+            "hover_at",
+            "type_text_at",
+            "key_combination",
+            "scroll_document",
+            "scroll_at",
+            "drag_and_drop",
+        ]
+
+        config = types.GenerateContentConfig(
+            tools=[
+                types.Tool(
+                    computer_use=types.ComputerUse(
+                        environment=types.Environment.ENVIRONMENT_BROWSER,
+                        excluded_predefined_functions=exclude_all_but_click,
+                    )
+                )
+            ]
+        )
+
+        # Prepare prompt parts
+        try:
+            img_bytes = base64.b64decode(image_b64)
+        except Exception:
+            img_bytes = b""
+
+        w, h = _bytes_image_size(img_bytes) if img_bytes else (1024, 768)
+
+        parts: List[Any] = [types.Part(text=f"Click {instruction}.")]
+        if img_bytes:
+            parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/png"))
+
+        contents = [types.Content(role="user", parts=parts)]
+
+        response = client.models.generate_content(
+            model=model,
+            contents=contents,
+            config=config,
+        )
+
+        # Parse first click_at
+        try:
+            candidate = response.candidates[0]
+            for p in candidate.content.parts:
+                fc = getattr(p, "function_call", None)
+                if fc and getattr(fc, "name", None) == "click_at":
+                    args = dict(getattr(fc, "args", {}) or {})
+                    x = _denormalize(int(args.get("x", 0)), w)
+                    y = _denormalize(int(args.get("y", 0)), h)
+                    return float(x), float(y)
+        except Exception:
+            return None
+
+        return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]
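Gemini's Computer Use function calls report coordinates on a 0-999 normalized grid, and `_denormalize` rescales them to the actual screenshot size before they are emitted as `computer_call` actions. A standalone worked example using the same helper (the screen dimensions below are illustrative):

```python
def _denormalize(v: int, size: int) -> int:
    # Gemini returns 0-999 normalized coordinates; clamp into the screen bounds.
    return max(0, min(size - 1, int(round(v / 1000 * size))))

# A click_at with x=500, y=250 on a 1280x800 screenshot maps to pixel (640, 200):
print(_denormalize(500, 1280), _denormalize(250, 800))  # 640 200
```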
agent/loops/moondream3.py ADDED
@@ -0,0 +1,464 @@
+"""
+Moondream3+ composed-grounded agent loop implementation.
+Grounding is handled by a local Moondream3 preview model via Transformers.
+Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".
+
+Differences from composed_grounded:
+- Provides a singleton Moondream3 client outside the class.
+- predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
+- If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then
+  run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
+- Add a user message listing all detected form UI names so the thinker can reference them.
+- If the thinking model doesn't support vision, filter out image content before calling litellm.
+"""
+
+from __future__ import annotations
+
+import uuid
+import base64
+import io
+from typing import Dict, List, Any, Optional, Tuple, Any
+
+from PIL import Image, ImageDraw, ImageFont
+import torch
+from transformers import AutoModelForCausalLM
+import litellm
+
+from ..decorators import register_agent
+from ..types import AgentCapability
+from ..loops.base import AsyncAgentConfig
+from ..responses import (
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    convert_completion_messages_to_responses_items,
+    convert_computer_calls_desc2xy,
+    get_all_element_descriptions,
+)
+
+_MOONDREAM_SINGLETON = None
+
+def get_moondream_model() -> Any:
+    """Get a singleton instance of the Moondream3 preview model."""
+    global _MOONDREAM_SINGLETON
+    if _MOONDREAM_SINGLETON is None:
+        _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
+            "moondream/moondream3-preview",
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda",
+        )
+    return _MOONDREAM_SINGLETON
+
+
+def _decode_image_b64(image_b64: str) -> Image.Image:
+    data = base64.b64decode(image_b64)
+    return Image.open(io.BytesIO(data)).convert("RGB")
+
+
+def _image_to_b64(img: Image.Image) -> str:
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+
+def _supports_vision(model: str) -> bool:
+    """Heuristic vision support detection for thinking model."""
+    m = model.lower()
+    vision_markers = [
+        "gpt-4o",
+        "gpt-4.1",
+        "o1",
+        "o3",
+        "claude-3",
+        "claude-3.5",
+        "sonnet",
+        "haiku",
+        "opus",
+        "gemini-1.5",
+        "llava",
+    ]
+    return any(v in m for v in vision_markers)
+
+
+def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    filtered: List[Dict[str, Any]] = []
+    for msg in messages:
+        msg_copy = {**msg}
+        content = msg_copy.get("content")
+        if isinstance(content, list):
+            msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
+        filtered.append(msg_copy)
+    return filtered
+
+def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
+    """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
+
+    Args:
+        base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
+        model_md: Moondream model instance with .detect() and .query() methods.
+
+    Returns:
+        A tuple of (annotated_image_base64_png, detected_names)
+    """
+    # Ensure RGBA for semi-transparent fills
+    if base_img.mode != "RGBA":
+        base_img = base_img.convert("RGBA")
+    W, H = base_img.width, base_img.height
+
+    # Detect objects
+    try:
+        detect_result = model_md.detect(base_img, "all ui elements")
+        objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
+    except Exception:
+        objects = []
+
+    draw = ImageDraw.Draw(base_img)
+    try:
+        font = ImageFont.load_default()
+    except Exception:
+        font = None
+
+    detected_names: List[str] = []
+
+    for i, obj in enumerate(objects):
+        try:
+            # Clamp normalized coords and crop
+            x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
+            y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
+            x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
+            y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
+            left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
+            left, top = max(0, left), max(0, top)
+            right, bottom = min(W - 1, right), min(H - 1, bottom)
+            crop = base_img.crop((left, top, right, bottom))
+
+            # Prompted short caption
+            try:
+                result = model_md.query(crop, "Caption this UI element in few words.")
+                caption_text = (result or {}).get("answer", "")
+            except Exception:
+                caption_text = ""
+
+            name = (caption_text or "").strip() or f"element_{i+1}"
+            detected_names.append(name)
+
+            # Draw bbox
+            draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)
+
+            # Label background with padding and rounded corners
+            label = f"{i+1}. {name}"
+            padding = 3
+            if font:
+                text_bbox = draw.textbbox((0, 0), label, font=font)
+            else:
+                text_bbox = draw.textbbox((0, 0), label)
+            text_w = text_bbox[2] - text_bbox[0]
+            text_h = text_bbox[3] - text_bbox[1]
+
+            tx = left + 3
+            ty = top - (text_h + 2 * padding + 4)
+            if ty < 0:
+                ty = top + 3
+
+            bg_left = tx - padding
+            bg_top = ty - padding
+            bg_right = tx + text_w + padding
+            bg_bottom = ty + text_h + padding
+            try:
+                draw.rounded_rectangle(
+                    [bg_left, bg_top, bg_right, bg_bottom],
+                    radius=4,
+                    fill=(0, 0, 0, 160),
+                    outline=(255, 215, 0, 200),
+                    width=1,
+                )
+            except Exception:
+                draw.rectangle(
+                    [bg_left, bg_top, bg_right, bg_bottom],
+                    fill=(0, 0, 0, 160),
+                    outline=(255, 215, 0, 200),
+                    width=1,
+                )
+
+            text_fill = (255, 255, 255, 255)
+            if font:
+                draw.text((tx, ty), label, fill=text_fill, font=font)
+            else:
+                draw.text((tx, ty), label, fill=text_fill)
+        except Exception:
+            continue
+
+    # Encode PNG base64
+    annotated = base_img
+    if annotated.mode not in ("RGBA", "RGB"):
+        annotated = annotated.convert("RGBA")
+    annotated_b64 = _image_to_b64(annotated)
+    return annotated_b64, detected_names
+
+GROUNDED_COMPUTER_TOOL_SCHEMA = {
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": (
+            "Control a computer by taking screenshots and interacting with UI elements. "
+            "The screenshot action will include a list of detected form UI element names when available. "
+            "Use element descriptions to locate and interact with UI elements on the screen."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click/double_click",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount (required for scroll)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount (required for scroll)",
+                },
+            },
+            "required": ["action"],
+        },
+    },
+}
+
+@register_agent(r"moondream3\+.*", priority=2)
+class Moondream3PlusConfig(AsyncAgentConfig):
+    def __init__(self):
+        self.desc2xy: Dict[str, Tuple[float, float]] = {}
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        # Parse composed model: moondream3+<thinking_model>
+        if "+" not in model:
+            raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
+        _, thinking_model = model.split("+", 1)
+
+        pre_output_items: List[Dict[str, Any]] = []
+
+        # Acquire last screenshot; if missing, take one
+        last_image_b64: Optional[str] = None
+        for message in reversed(messages):
+            if (
+                isinstance(message, dict)
+                and message.get("type") == "computer_call_output"
+                and isinstance(message.get("output"), dict)
+                and message["output"].get("type") == "input_image"
+            ):
+                image_url = message["output"].get("image_url", "")
+                if image_url.startswith("data:image/png;base64,"):
+                    last_image_b64 = image_url.split(",", 1)[1]
+                    break
+
+        if last_image_b64 is None and computer_handler is not None:
+            # Take a screenshot
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
+            if screenshot_b64:
+                call_id = uuid.uuid4().hex
+                pre_output_items += [
+                    {
+                        "type": "message",
+                        "role": "assistant",
+                        "content": [
+                            {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
+                        ],
+                    },
+                    {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
+                    {
+                        "type": "computer_call_output",
+                        "call_id": call_id,
+                        "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
+                    },
+                ]
+                last_image_b64 = screenshot_b64
+                if _on_screenshot:
+                    await _on_screenshot(screenshot_b64)
+
+        # If we have a last screenshot, run Moondream detection and labeling
+        detected_names: List[str] = []
+        if last_image_b64 is not None:
+            base_img = _decode_image_b64(last_image_b64)
+            model_md = get_moondream_model()
+            annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
+            if _on_screenshot:
+                await _on_screenshot(annotated_b64, "annotated_form_ui")
+
+            # Also push a user message listing all detected names
+            if detected_names:
+                names_text = "\n".join(f"- {n}" for n in detected_names)
+                pre_output_items.append(
+                    {
+                        "type": "message",
+                        "role": "user",
+                        "content": [
+                            {"type": "input_text", "text": "Detected form UI elements on screen:"},
+                            {"type": "input_text", "text": names_text},
+                            {"type": "input_text", "text": "Please continue with the next action needed to perform your task."}
+                        ],
+                    }
+                )
+
+        tool_schemas = []
+        for schema in (tools or []):
+            if schema.get("type") == "computer":
+                tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
+            else:
+                tool_schemas.append(schema)
+
+        # Step 1: Convert computer calls from xy to descriptions
+        input_messages = messages + pre_output_items
+        messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
+
+        # Step 2: Convert responses items to completion messages
+        completion_messages = convert_responses_items_to_completion_messages(
+            messages_with_descriptions,
+            allow_images_in_tool_results=False,
+        )
+
+        # Optionally filter images if model lacks vision
+        if not _supports_vision(thinking_model):
+            completion_messages = _filter_images_from_completion_messages(completion_messages)
+
+        # Step 3: Call thinking model with litellm.acompletion
+        api_kwargs = {
+            "model": thinking_model,
+            "messages": completion_messages,
+            "tools": tool_schemas,
+            "max_retries": max_retries,
+            "stream": stream,
+            **kwargs,
+        }
+        if use_prompt_caching:
+            api_kwargs["use_prompt_caching"] = use_prompt_caching
+
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        response = await litellm.acompletion(**api_kwargs)
+
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        usage = {
+            **response.usage.model_dump(),  # type: ignore
+            "response_cost": response._hidden_params.get("response_cost", 0.0),
+        }
+        if _on_usage:
+            await _on_usage(usage)
+
+        # Step 4: Convert completion messages back to responses items format
+        response_dict = response.model_dump()  # type: ignore
+        choice_messages = [choice["message"] for choice in response_dict["choices"]]
+        thinking_output_items: List[Dict[str, Any]] = []
+        for choice_message in choice_messages:
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
+        # Step 5: Use Moondream to get coordinates for each description
+        element_descriptions = get_all_element_descriptions(thinking_output_items)
+        if element_descriptions and last_image_b64:
+            for desc in element_descriptions:
+                for _ in range(3):  # try 3 times
+                    coords = await self.predict_click(
+                        model=model,
+                        image_b64=last_image_b64,
+                        instruction=desc,
+                    )
+                    if coords:
+                        self.desc2xy[desc] = coords
+                        break
+
+        # Step 6: Convert computer calls from descriptions back to xy coordinates
+        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
+
+        # Step 7: Return output and usage
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs,
+    ) -> Optional[Tuple[float, float]]:
+        """Predict click coordinates using Moondream3's point API.
+
+        Returns pixel coordinates (x, y) as floats.
+        """
+        img = _decode_image_b64(image_b64)
+        W, H = img.width, img.height
+        model_md = get_moondream_model()
+        try:
+            result = model_md.point(img, instruction, settings={"max_objects": 1})
+        except Exception:
+            return None
+
+        try:
+            pt = (result or {}).get("points", [])[0]
+            x_norm = float(pt.get("x", 0.0))
+            y_norm = float(pt.get("y", 0.0))
+            x_px = max(0.0, min(float(W - 1), x_norm * W))
+            y_px = max(0.0, min(float(H - 1), y_norm * H))
+            return (x_px, y_px)
+        except Exception:
+            return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]
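Unlike Gemini's 0-999 grid, Moondream3's `point` API returns coordinates normalized to the 0-1 range, which `predict_click` scales by the screenshot's pixel dimensions and clamps to the image bounds. A standalone sketch of that conversion (the point value is made up for illustration):

```python
def point_to_pixels(pt: dict, width: int, height: int) -> tuple[float, float]:
    # Moondream3 points are normalized to [0, 1]; scale and clamp into the image.
    x_px = max(0.0, min(float(width - 1), float(pt.get("x", 0.0)) * width))
    y_px = max(0.0, min(float(height - 1), float(pt.get("y", 0.0)) * height))
    return (x_px, y_px)

print(point_to_pixels({"x": 0.5, "y": 0.25}, 1920, 1080))  # (960.0, 270.0)
```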
agent/loops/openai.py CHANGED
@@ -53,8 +53,7 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools

     return openai_tools

-
-@register_agent(models=r".*computer-use-preview.*")
+@register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
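The tightened registration pattern matters now that a dedicated Gemini Computer Use loop exists: the old wildcard would also have claimed `gemini-2.5-computer-use-preview-10-2025`. A quick comparison of the two patterns, using a plain `re.search` (the registry's exact matching call is not shown in this diff, so treat this as an approximation):

```python
import re

OLD = r".*computer-use-preview.*"
NEW = r".*(^|/)computer-use-preview"

for model in (
    "openai/computer-use-preview",
    "computer-use-preview",
    "gemini-2.5-computer-use-preview-10-2025",
):
    print(model, bool(re.search(OLD, model)), bool(re.search(NEW, model)))
# openai/computer-use-preview: True True
# computer-use-preview: True True
# gemini-2.5-computer-use-preview-10-2025: True False  (no longer claimed by the OpenAI loop)
```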
cua_agent-0.4.32.dist-info/METADATA → cua_agent-0.4.33.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.32
+Version: 0.4.33
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12
@@ -49,7 +49,9 @@ Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
 Provides-Extra: cli
 Requires-Dist: yaspin>=3.1.0; extra == "cli"
 Provides-Extra: hud
-Requires-Dist: hud-python==0.4.26; extra == "hud"
+Requires-Dist: hud-python==0.4.52; extra == "hud"
+Provides-Extra: gemini
+Requires-Dist: google-genai>=1.41.0; extra == "gemini"
 Provides-Extra: all
 Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
 Requires-Dist: accelerate; extra == "all"
@@ -62,7 +64,8 @@ Requires-Dist: blobfile>=3.0.0; extra == "all"
 Requires-Dist: gradio>=5.23.3; extra == "all"
 Requires-Dist: python-dotenv>=1.0.1; extra == "all"
 Requires-Dist: yaspin>=3.1.0; extra == "all"
-Requires-Dist: hud-python==0.4.26; extra == "all"
+Requires-Dist: hud-python==0.4.52; extra == "all"
+Requires-Dist: google-genai>=1.41.0; extra == "all"
 Description-Content-Type: text/markdown

 <div align="center">
cua_agent-0.4.32.dist-info/RECORD → cua_agent-0.4.33.dist-info/RECORD
@@ -20,7 +20,7 @@ agent/callbacks/pii_anonymization.py,sha256=NEkUTUjQBi82nqus7kT-1E4RaeQ2hQrY7YCn
 agent/callbacks/prompt_instructions.py,sha256=RUqsJhiNiXqaOM_P2AfyBinWUDdgDku46BExLMUJHn4,1517
 agent/callbacks/telemetry.py,sha256=RbUDhE41mTi8g9hNre0EpltK_NUZkLj8buJLWBzs0Ek,7363
 agent/callbacks/trajectory_saver.py,sha256=-XNgiKU6T8Qw_i2AZMQuw0HuUe6MHkU89rjn_T386Rw,16128
-agent/cli.py,sha256=HddU18IvvKdyvQu0ru21nAcNc6k7toYuyjgORIzX_qo,16110
+agent/cli.py,sha256=QsHx1w0THEdaq7YOXPZ_mprWtO_n64sgPAMFUOIJ3og,17603
 agent/computers/__init__.py,sha256=39ISJsaREaQIZckpzxSuLhuR763wUU3TxUux78EKjAg,1477
 agent/computers/base.py,sha256=hZntX4vgc1ahD3EnFeb9lUjtBmgka1vb27hndPl9tKQ,2187
 agent/computers/cua.py,sha256=xp2A34kT2C1NKqSRo2GB6766gkraM-UtpFjRv8LUTSc,4889
@@ -33,17 +33,19 @@ agent/human_tool/ui.py,sha256=wu9eZorhxCkyPTlBSZjYaVzutoHMlucAz8UGNpAT4bM,30644
 agent/integrations/hud/__init__.py,sha256=xir5BVAlG2cFc7rHSx_Ea_2b1kp2TtFuKJk07jny7qY,5969
 agent/integrations/hud/agent.py,sha256=GBikd9MhjDNKMiMG8J7PE3OMSmvmC_JLZ1p5xr2cZoc,14006
 agent/integrations/hud/proxy.py,sha256=8HUoh7uZ8Z3vkhPXK0dskgePGsP8oCqyYij0mE_E7X8,10902
-agent/loops/__init__.py,sha256=c6stEkT15smK8ZIf9j2kyOko84uz1YIvHXx0Mbe2wq8,472
-agent/loops/anthropic.py,sha256=ODrMvmTkyzIOLjGq6HbKzzgBu19TE_Xlsi--7vc5T6o,70196
+agent/loops/__init__.py,sha256=Nefn042YQMMaC6tTHvaQ17m9hNEVSPG4Xh2rpujfSos,549
+agent/loops/anthropic.py,sha256=hGqRcUYaajnOTIlEGCpLeHqUoIzS293M8sqFOC_NTUY,70211
 agent/loops/base.py,sha256=LK7kSTnc2CB88LI7qr2VP7LMq0eS5r2bSEnrxO6IN5U,2345
 agent/loops/composed_grounded.py,sha256=Um_8G0v5DEzF_A9wWIGp_IDPDMvv4IXDTFpEDH92Vto,12367
+agent/loops/gemini.py,sha256=m_bGdxujWBmzYpEnZg84OXDCyh06MYNiDrO3beVstCQ,13718
 agent/loops/glm45v.py,sha256=EKAoh-PWkcCdzBVebjXbdqoDNkXgcmJpIqmTNPiZ8TM,35127
 agent/loops/gta1.py,sha256=uGIcUH5ChzO75eGvoQxuKMBWjX-1J9-xmC7vPetobjU,5831
 agent/loops/holo.py,sha256=peQ0xx4XQDBQ3g2XKRLCgyrU_2PkXe3RaysNBqFyS90,7481
 agent/loops/internvl.py,sha256=iQs6DSoP9JOyUxRAz_HPuv4Hi2Sbv-Jc3022W-oPX5Y,6596
 agent/loops/model_types.csv,sha256=GmFn4x80yoUpQZuQ-GXtJkPVlOLYWZ5u_5A73HRyeNE,112
+agent/loops/moondream3.py,sha256=_h4k6Z7VyBU8bw-Av0RiOt-GanCJqU46ZLF03eyRgug,17828
 agent/loops/omniparser.py,sha256=-db8JUL2Orn47ERIaLbuNShAXn4LeIgYzRWphn_9Dg4,15071
-agent/loops/openai.py,sha256=3UEXdecqGkyknhTgp6zxr_cNCVg5vM-61I6SKMNl6m8,8692
+agent/loops/openai.py,sha256=2typWRS7j2sVm52AzwwigPniCrdw9IVvllypjXN2mKI,8694
 agent/loops/opencua.py,sha256=Chb4UASHDrdcX_fO__Gw2e9ay4Hl6Vq38K5x-IoHyuo,4432
 agent/loops/uitars.py,sha256=mVPt4V-HabX7ZiQnM55BVQt73CuZUjmUAsbm4Tf6TXk,32351
 agent/proxy/examples.py,sha256=GYFJ-sfDsSNZr9n_qpvDx_0rShqoKE5JW0ibbljWfoo,6192
@@ -55,7 +57,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
 agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
 agent/ui/gradio/app.py,sha256=Ol97YEbwREZZQ9_PMjVHlfOcu9BGsawxgAGAm79hT80,9117
 agent/ui/gradio/ui_components.py,sha256=dJUvKDmc1oSejtoR_gU_oWWYwxaOOQyPloSYRGMrUCQ,36068
-cua_agent-0.4.32.dist-info/METADATA,sha256=9DM4yfZ8hH6-JeNvke6WOgzZLEF0i3A8cDeb3aTGpyk,6340
-cua_agent-0.4.32.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
-cua_agent-0.4.32.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
-cua_agent-0.4.32.dist-info/RECORD,,
+cua_agent-0.4.33.dist-info/METADATA,sha256=LIl2V_RBTn8mrq69gBc_7fpWfQnxBHEnkInmnToJ8Qw,6470
+cua_agent-0.4.33.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
+cua_agent-0.4.33.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+cua_agent-0.4.33.dist-info/RECORD,,