cua-agent 0.4.31__py3-none-any.whl → 0.4.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/adapters/huggingfacelocal_adapter.py +15 -66
- agent/adapters/models/__init__.py +33 -0
- agent/adapters/models/generic.py +75 -0
- agent/adapters/models/internvl.py +254 -0
- agent/adapters/models/opencua.py +100 -0
- agent/adapters/models/qwen2_5_vl.py +75 -0
- agent/agent.py +5 -1
- agent/callbacks/trajectory_saver.py +2 -0
- agent/cli.py +147 -22
- agent/loops/__init__.py +19 -1
- agent/loops/anthropic.py +3 -4
- agent/loops/composed_grounded.py +1 -1
- agent/loops/gemini.py +391 -0
- agent/loops/glm45v.py +3 -2
- agent/loops/gta1.py +1 -1
- agent/loops/holo.py +216 -0
- agent/loops/internvl.py +185 -0
- agent/loops/moondream3.py +464 -0
- agent/loops/openai.py +1 -2
- agent/loops/opencua.py +142 -0
- agent/loops/uitars.py +1 -1
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/METADATA +23 -4
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/RECORD +25 -15
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.31.dist-info → cua_agent-0.4.33.dist-info}/entry_points.txt +0 -0
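The added loop files register themselves against model-name patterns via @register_agent, as the sources below show. As a minimal sketch of that dispatch (the pattern strings are copied from the added files; the find_loop helper and the example model strings are hypothetical, not part of the package):

import re
from typing import Optional

# Patterns copied from the @register_agent decorators in the added files below
PATTERNS = [
    r"^gemini-2\.5-computer-use-preview-10-2025$",  # agent/loops/gemini.py
    r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*",        # agent/loops/holo.py
]

def find_loop(model: str) -> Optional[str]:
    # Hypothetical helper: return the first registered pattern the model name matches
    for pattern in PATTERNS:
        if re.match(pattern, model):
            return pattern
    return None

print(find_loop("gemini-2.5-computer-use-preview-10-2025"))  # matches the Gemini loop
print(find_loop("huggingface-local/Hcompany/Holo1.5-7B"))    # matches the Holo loop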
agent/loops/gemini.py
ADDED
@@ -0,0 +1,391 @@
"""
Gemini 2.5 Computer Use agent loop

Maps internal Agent SDK message format to Google's Gemini Computer Use API and back.

Key features:
- Lazy import of google.genai
- Configure Computer Use tool with excluded browser-specific predefined functions
- Optional custom function declarations hook for computer-call specific functions
- Convert Gemini function_call parts into internal computer_call actions
"""

from __future__ import annotations

import base64
import io
import uuid
from typing import Any, Dict, List, Optional, Tuple

from PIL import Image

from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability


def _lazy_import_genai():
    """Import google.genai lazily to avoid hard dependency unless used."""
    try:
        from google import genai  # type: ignore
        from google.genai import types  # type: ignore
        return genai, types
    except Exception as e:  # pragma: no cover
        raise RuntimeError(
            "google.genai is required for the Gemini Computer Use loop. Install the Google Gemini SDK."
        ) from e


def _data_url_to_bytes(data_url: str) -> Tuple[bytes, str]:
    """Convert a data URL to raw bytes and mime type."""
    if not data_url.startswith("data:"):
        # Assume it's base64 png payload
        try:
            return base64.b64decode(data_url), "image/png"
        except Exception:
            return b"", "application/octet-stream"
    header, b64 = data_url.split(",", 1)
    mime = "image/png"
    if ";" in header:
        mime = header.split(";")[0].split(":", 1)[1] or "image/png"
    return base64.b64decode(b64), mime


def _bytes_image_size(img_bytes: bytes) -> Tuple[int, int]:
    try:
        img = Image.open(io.BytesIO(img_bytes))
        return img.size
    except Exception:
        return (1024, 768)


def _find_last_user_text(messages: List[Dict[str, Any]]) -> List[str]:
    texts: List[str] = []
    for msg in reversed(messages):
        if msg.get("type") in (None, "message") and msg.get("role") == "user":
            content = msg.get("content")
            if isinstance(content, str):
                return [content]
            elif isinstance(content, list):
                for c in content:
                    if c.get("type") in ("input_text", "output_text") and c.get("text"):
                        texts.append(c["text"])  # newest first
            if texts:
                return list(reversed(texts))
    return []


def _find_last_screenshot(messages: List[Dict[str, Any]]) -> Optional[bytes]:
    for msg in reversed(messages):
        if msg.get("type") == "computer_call_output":
            out = msg.get("output", {})
            if isinstance(out, dict) and out.get("type") in ("input_image", "computer_screenshot"):
                image_url = out.get("image_url", "")
                if image_url:
                    data, _ = _data_url_to_bytes(image_url)
                    return data
    return None


def _denormalize(v: int, size: int) -> int:
    # Gemini returns 0-999 normalized
    try:
        return max(0, min(size - 1, int(round(v / 1000 * size))))
    except Exception:
        return 0


def _map_gemini_fc_to_computer_call(
    fc: Dict[str, Any],
    screen_w: int,
    screen_h: int,
) -> Optional[Dict[str, Any]]:
    name = fc.get("name")
    args = fc.get("args", {}) or {}

    action: Dict[str, Any] = {}
    if name == "click_at":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        action = {"type": "click", "x": x, "y": y, "button": "left"}
    elif name == "type_text_at":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        text = args.get("text", "")
        if args.get("press_enter") == True:
            text += "\n"
        action = {"type": "type", "x": x, "y": y, "text": text}
    elif name == "hover_at":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        action = {"type": "move", "x": x, "y": y}
    elif name == "key_combination":
        keys = str(args.get("keys", ""))
        action = {"type": "keypress", "keys": keys}
    elif name == "scroll_document":
        direction = args.get("direction", "down")
        magnitude = 800
        dx, dy = 0, 0
        if direction == "down":
            dy = magnitude
        elif direction == "up":
            dy = -magnitude
        elif direction == "right":
            dx = magnitude
        elif direction == "left":
            dx = -magnitude
        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": int(screen_w / 2), "y": int(screen_h / 2)}
    elif name == "scroll_at":
        x = _denormalize(int(args.get("x", 500)), screen_w)
        y = _denormalize(int(args.get("y", 500)), screen_h)
        direction = args.get("direction", "down")
        magnitude = int(args.get("magnitude", 800))
        dx, dy = 0, 0
        if direction == "down":
            dy = magnitude
        elif direction == "up":
            dy = -magnitude
        elif direction == "right":
            dx = magnitude
        elif direction == "left":
            dx = -magnitude
        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": x, "y": y}
    elif name == "drag_and_drop":
        x = _denormalize(int(args.get("x", 0)), screen_w)
        y = _denormalize(int(args.get("y", 0)), screen_h)
        dx = _denormalize(int(args.get("destination_x", x)), screen_w)
        dy = _denormalize(int(args.get("destination_y", y)), screen_h)
        action = {"type": "drag", "start_x": x, "start_y": y, "end_x": dx, "end_y": dy, "button": "left"}
    elif name == "wait_5_seconds":
        action = {"type": "wait"}
    else:
        # Unsupported / excluded browser-specific or custom function; ignore
        return None

    return {
        "type": "computer_call",
        "call_id": uuid.uuid4().hex,
        "status": "completed",
        "action": action,
    }


@register_agent(models=r"^gemini-2\.5-computer-use-preview-10-2025$")
class GeminiComputerUseConfig(AsyncAgentConfig):
    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        genai, types = _lazy_import_genai()

        client = genai.Client()

        # Build excluded predefined functions for browser-specific behavior
        excluded = [
            "open_web_browser",
            "search",
            "navigate",
            "go_forward",
            "go_back",
            "scroll_document",
        ]
        # Optional custom functions: can be extended by host code via `tools` parameter later if desired
        CUSTOM_FUNCTION_DECLARATIONS: List[Any] = []

        # Compose tools config
        generate_content_config = types.GenerateContentConfig(
            tools=[
                types.Tool(
                    computer_use=types.ComputerUse(
                        environment=types.Environment.ENVIRONMENT_BROWSER,
                        excluded_predefined_functions=excluded,
                    )
                ),
                # types.Tool(function_declarations=CUSTOM_FUNCTION_DECLARATIONS),  # enable when custom functions needed
            ]
        )

        # Prepare contents: last user text + latest screenshot
        user_texts = _find_last_user_text(messages)
        screenshot_bytes = _find_last_screenshot(messages)

        parts: List[Any] = []
        for t in user_texts:
            parts.append(types.Part(text=t))

        screen_w, screen_h = 1024, 768
        if screenshot_bytes:
            screen_w, screen_h = _bytes_image_size(screenshot_bytes)
            parts.append(types.Part.from_bytes(data=screenshot_bytes, mime_type="image/png"))

        # If we don't have any content, at least pass an empty user part to prompt reasoning
        if not parts:
            parts = [types.Part(text="Proceed to the next action.")]

        contents = [types.Content(role="user", parts=parts)]

        api_kwargs = {
            "model": model,
            "contents": contents,
            "config": generate_content_config,
        }

        if _on_api_start:
            await _on_api_start({
                "model": api_kwargs["model"],
                # "contents": api_kwargs["contents"],  # Disabled for now
                "config": api_kwargs["config"],
            })

        response = client.models.generate_content(**api_kwargs)

        if _on_api_end:
            await _on_api_end({
                "model": api_kwargs["model"],
                # "contents": api_kwargs["contents"],  # Disabled for now
                "config": api_kwargs["config"],
            }, response)

        # Usage (Gemini SDK may not always provide token usage; populate when available)
        usage: Dict[str, Any] = {}
        try:
            # Some SDKs expose response.usage; if available, copy
            if getattr(response, "usage_metadata", None):
                md = response.usage_metadata
                usage = {
                    "prompt_tokens": getattr(md, "prompt_token_count", None) or 0,
                    "completion_tokens": getattr(md, "candidates_token_count", None) or 0,
                    "total_tokens": getattr(md, "total_token_count", None) or 0,
                }
        except Exception:
            pass

        if _on_usage and usage:
            await _on_usage(usage)

        # Parse output into internal items
        output_items: List[Dict[str, Any]] = []

        candidate = response.candidates[0]
        # Text parts from the model (assistant message)
        text_parts: List[str] = []
        function_calls: List[Dict[str, Any]] = []
        for p in candidate.content.parts:
            if getattr(p, "text", None):
                text_parts.append(p.text)
            if getattr(p, "function_call", None):
                # p.function_call has name and args
                fc = {
                    "name": getattr(p.function_call, "name", None),
                    "args": dict(getattr(p.function_call, "args", {}) or {}),
                }
                function_calls.append(fc)

        if text_parts:
            output_items.append(
                {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": "\n".join(text_parts)}],
                }
            )

        # Map function calls to internal computer_call actions
        for fc in function_calls:
            item = _map_gemini_fc_to_computer_call(fc, screen_w, screen_h)
            if item is not None:
                output_items.append(item)

        return {"output": output_items, "usage": usage}

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs,
    ) -> Optional[Tuple[float, float]]:
        """Ask Gemini CUA to output a single click action for the given instruction.

        Excludes all predefined tools except `click_at` and sends the screenshot.
        Returns pixel (x, y) if a click is proposed, else None.
        """
        genai, types = _lazy_import_genai()

        client = genai.Client()

        # Exclude all but click_at
        exclude_all_but_click = [
            "open_web_browser",
            "wait_5_seconds",
            "go_back",
            "go_forward",
            "search",
            "navigate",
            "hover_at",
            "type_text_at",
            "key_combination",
            "scroll_document",
            "scroll_at",
            "drag_and_drop",
        ]

        config = types.GenerateContentConfig(
            tools=[
                types.Tool(
                    computer_use=types.ComputerUse(
                        environment=types.Environment.ENVIRONMENT_BROWSER,
                        excluded_predefined_functions=exclude_all_but_click,
                    )
                )
            ]
        )

        # Prepare prompt parts
        try:
            img_bytes = base64.b64decode(image_b64)
        except Exception:
            img_bytes = b""

        w, h = _bytes_image_size(img_bytes) if img_bytes else (1024, 768)

        parts: List[Any] = [types.Part(text=f"Click {instruction}.")]
        if img_bytes:
            parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/png"))

        contents = [types.Content(role="user", parts=parts)]

        response = client.models.generate_content(
            model=model,
            contents=contents,
            config=config,
        )

        # Parse first click_at
        try:
            candidate = response.candidates[0]
            for p in candidate.content.parts:
                fc = getattr(p, "function_call", None)
                if fc and getattr(fc, "name", None) == "click_at":
                    args = dict(getattr(fc, "args", {}) or {})
                    x = _denormalize(int(args.get("x", 0)), w)
                    y = _denormalize(int(args.get("y", 0)), h)
                    return float(x), float(y)
        except Exception:
            return None

        return None

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click", "step"]
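As a usage sketch, the predict_click entry point above can be driven directly; the screenshot path and asyncio wiring here are hypothetical, and genai.Client() is assumed to pick up Gemini API credentials from the environment:

import asyncio
import base64

async def demo() -> None:
    # Hypothetical driver for the loop above; requires the google-genai SDK
    # and Gemini API credentials in the environment.
    with open("screenshot.png", "rb") as f:  # hypothetical screenshot path
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    agent = GeminiComputerUseConfig()
    coords = await agent.predict_click(
        model="gemini-2.5-computer-use-preview-10-2025",
        image_b64=image_b64,
        instruction="the blue 'Submit' button",  # hypothetical target
    )
    print(coords)  # pixel (x, y) floats, or None if no click_at was proposed

asyncio.run(demo())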
agent/loops/glm45v.py
CHANGED
@@ -844,7 +844,7 @@ Where x,y are coordinates normalized to 0-999 range."""
         api_kwargs = {
             "model": model,
             "messages": litellm_messages,
-            "max_tokens":
+            "max_tokens": 2056,
             "temperature": 0.001,
             "extra_body": {
                 "skip_special_tokens": False,
@@ -856,6 +856,7 @@ Where x,y are coordinates normalized to 0-999 range."""

         # Extract response content
         response_content = response.choices[0].message.content.strip()
+        print(response)

         # Parse response for click coordinates
         # Look for coordinates in the response, handling special tokens
@@ -866,7 +867,7 @@ Where x,y are coordinates normalized to 0-999 range."""
         # Fallback: look for coordinates without special tokens
         coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
         match = re.search(coord_pattern, response_content)
-
+
         if match:
             x, y = int(match.group(1)), int(match.group(2))

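The fallback pattern in the last hunk can be exercised on its own; a minimal sketch (the sample model output string is invented):

import re

coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
sample = "left_click(start_box='[512,300]')"  # invented model output

match = re.search(coord_pattern, sample)
if match:
    # Captured coordinates are in the model's 0-999 normalized range,
    # per the prompt quoted in the hunk headers above
    x, y = int(match.group(1)), int(match.group(2))
    print(x, y)  # -> 512 300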
agent/loops/gta1.py
CHANGED
agent/loops/holo.py
ADDED
@@ -0,0 +1,216 @@
"""
Holo 1.5 agent loop implementation for click prediction using litellm.acompletion.

Implements the Holo1.5 grounding behavior:
- Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int}
- Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor)
- If resized, maps predicted coordinates back to the original screenshot resolution

Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter)
will handle loading based on the provided model name.
"""

from __future__ import annotations

import base64
import json
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

import litellm
from PIL import Image

from ..decorators import register_agent
from .base import AsyncAgentConfig
from ..types import AgentCapability


def _strip_hf_prefix(model: str) -> str:
    """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load."""
    if "/" in model and model.lower().startswith("huggingface-local/"):
        return model.split("/", 1)[1]
    return model


def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]:
    """
    Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor.

    Returns (processed_image, (orig_w, orig_h)). If transformers or processor unavailable,
    returns the original image and size without resizing.
    """
    orig_w, orig_h = image.size
    try:
        # Import lazily to avoid hard dependency if not installed
        from transformers import AutoProcessor  # type: ignore
        from transformers.models.qwen2_vl.image_processing_qwen2_vl import (  # type: ignore
            smart_resize,
        )

        processor_name = _strip_hf_prefix(model)
        processor = AutoProcessor.from_pretrained(processor_name)
        image_processor = getattr(processor, "image_processor", None)
        if image_processor is None:
            return image, (orig_w, orig_h)

        factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1)
        min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
        max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)

        resized_h, resized_w = smart_resize(
            orig_h,
            orig_w,
            factor=factor,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        if (resized_w, resized_h) == (orig_w, orig_h):
            return image, (orig_w, orig_h)

        processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS)
        return processed, (orig_w, orig_h)
    except Exception:
        # If any failure (no transformers, processor load error), fall back to original
        return image, (orig_w, orig_h)


def _build_holo_prompt(instruction: str) -> str:
    """Construct the Holo1.5 grounding prompt."""
    # Keep it close to the cookbook while avoiding heavy schema generation
    schema_hint = '{"action": "click_absolute", "x": <int>, "y": <int>}'
    return (
        "Localize an element on the GUI image according to the provided target and output a click position. "
        f"You must output a valid JSON following the format: {schema_hint} "
        f"Your target is: {instruction}"
    )


def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]:
    """
    Parse JSON from model output and extract x, y ints.
    Tries to find the first JSON object substring if extra text is present.
    """
    try:
        # Fast path: direct JSON
        data = json.loads(output_text)
    except Exception:
        # Try to locate a JSON object within the text
        start = output_text.find("{")
        end = output_text.rfind("}")
        if start == -1 or end == -1 or end <= start:
            return None
        try:
            data = json.loads(output_text[start : end + 1])
        except Exception:
            return None

    try:
        x = int(data.get("x"))
        y = int(data.get("y"))
        return x, y
    except Exception:
        return None


@register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*")
class HoloConfig(AsyncAgentConfig):
    """Holo is a family of UI grounding models from H Company"""

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        # Holo models are only trained on UI localization tasks, not all-in-one agent
        raise NotImplementedError()

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs,
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using Holo1.5 via litellm.acompletion.

        - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available
        - Prompts for JSON with absolute pixel coordinates
        - Parses x,y and maps back to original screenshot size if resized
        """
        try:
            img_bytes = base64.b64decode(image_b64)
            original_img = Image.open(BytesIO(img_bytes))
        except Exception:
            return None

        # Optional preprocessing
        processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model)

        # If we resized, send the resized image; otherwise send original
        img_to_send = processed_img
        buf = BytesIO()
        img_to_send.save(buf, format="PNG")
        processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        prompt = _build_holo_prompt(instruction)

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{processed_b64}"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        api_kwargs = {
            "model": model,
            "messages": messages,
            # Deterministic, small output
            "max_tokens": kwargs.get("max_tokens", 256),
            "temperature": kwargs.get("temperature", 0.0),
        }

        response = await litellm.acompletion(**api_kwargs)
        output_text = (response.choices[0].message.content or "").strip()  # type: ignore

        coords = _parse_click_json(output_text)
        if coords is None:
            return None

        x, y = coords

        # Map back to original size if we resized
        proc_w, proc_h = img_to_send.size
        if (proc_w, proc_h) != (orig_w, orig_h):
            try:
                sx = orig_w / float(proc_w)
                sy = orig_h / float(proc_h)
                x = int(round(x * sx))
                y = int(round(y * sy))
            except Exception:
                # Fallback: clamp within original bounds
                pass

        # Clamp to original image bounds
        x = max(0, min(orig_w - 1, x))
        y = max(0, min(orig_h - 1, y))
        return x, y

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click"]
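A quick round trip of the helpers above (the instruction and the model reply string are invented; in the package the reply comes from litellm.acompletion):

# Build the grounding prompt, then parse a reply that wraps the JSON in extra prose,
# which exercises the JSON-substring fallback in _parse_click_json.
prompt = _build_holo_prompt("the 'Save' button in the toolbar")  # hypothetical target
reply = 'Here you go: {"action": "click_absolute", "x": 412, "y": 87}'  # invented reply

print(_parse_click_json(reply))  # -> (412, 87)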