cua-agent 0.4.31__py3-none-any.whl → 0.4.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

@@ -188,6 +188,8 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
         if "+" in model:
             model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
+        # strip non-alphanumeric characters from model_name_short
+        model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
 
         # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
         now = datetime.now()
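
For illustration only (not part of the diff), a minimal sketch of what the new sanitization step does to a composed model name; the model string below is hypothetical:

    # Hypothetical composed model string (two models joined with "+")
    model = "openai/gpt-4o+huggingface-local/Hcompany/Holo1.5-7B"

    model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]  # "holo1.5-7b"
    if "+" in model:
        model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short  # "open_holo1.5-7b"

    # new in 0.4.32: keep only alphanumerics and underscores
    model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
    print(model_name_short)  # "open_holo157b"
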
agent/cli.py CHANGED
@@ -18,6 +18,15 @@ try:
     import json
     from typing import List, Dict, Any
     import dotenv
+    import base64
+    import time
+    import platform
+    from pathlib import Path
+    try:
+        from PIL import Image, ImageDraw
+        PIL_AVAILABLE = True
+    except Exception:
+        PIL_AVAILABLE = False
     from yaspin import yaspin
 except ImportError:
     if __name__ == "__main__":
@@ -248,6 +257,13 @@ Examples:
         help="Initial prompt to send to the agent. Leave blank for interactive mode."
     )
 
+    parser.add_argument(
+        "--predict-click",
+        dest="predict_click",
+        type=str,
+        help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
+    )
+
     parser.add_argument(
         "-c", "--cache",
         action="store_true",
@@ -331,6 +347,7 @@ Examples:
     agent_kwargs = {
         "model": args.model,
         "tools": [computer],
+        "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
         "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
         "max_retries": args.max_retries
     }
@@ -353,7 +370,79 @@ Examples:
 
     agent = ComputerAgent(**agent_kwargs)
 
-    # Start chat loop
+    # If predict-click mode is requested, run once and exit
+    if args.predict_click:
+        if not PIL_AVAILABLE:
+            print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
+            sys.exit(1)
+
+        instruction = args.predict_click
+        print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
+
+        # Take a fresh screenshot FIRST
+        try:
+            img_bytes = await computer.interface.screenshot()
+        except Exception as e:
+            print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
+            sys.exit(1)
+
+        # Encode screenshot to base64 for predict_click
+        try:
+            image_b64 = base64.b64encode(img_bytes).decode("utf-8")
+        except Exception as e:
+            print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
+            sys.exit(1)
+
+        try:
+            coords = await agent.predict_click(instruction, image_b64=image_b64)
+        except Exception as e:
+            print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
+            sys.exit(1)
+
+        if not coords:
+            print_colored("⚠️ No coordinates returned.", Colors.YELLOW)
+            sys.exit(2)
+
+        x, y = coords
+        print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
+
+        try:
+            from io import BytesIO
+            with Image.open(BytesIO(img_bytes)) as img:
+                img = img.convert("RGB")
+                draw = ImageDraw.Draw(img)
+                # Draw crosshair
+                size = 12
+                color = (255, 0, 0)
+                draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
+                draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
+                # Optional small circle
+                r = 6
+                draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
+
+                out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
+                img.save(out_path)
+                print_colored(f"🖼️ Saved to {out_path}")
+
+                # Open the image with default viewer
+                try:
+                    system = platform.system().lower()
+                    if system == "windows":
+                        os.startfile(str(out_path))  # type: ignore[attr-defined]
+                    elif system == "darwin":
+                        os.system(f"open \"{out_path}\"")
+                    else:
+                        os.system(f"xdg-open \"{out_path}\"")
+                except Exception:
+                    pass
+        except Exception as e:
+            print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
+            sys.exit(1)
+
+        # Done
+        sys.exit(0)
+
+    # Start chat loop (default interactive mode)
     await chat_loop(agent, args.model, container_name, args.prompt, args.usage)
 
 
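As a standalone sketch (not from the package), the crosshair drawing added above can be reproduced with plain Pillow; the image size and click point here are made up:

    from PIL import Image, ImageDraw

    # Hypothetical screenshot and predicted click point
    img = Image.new("RGB", (1280, 800), (255, 255, 255))
    x, y = 640, 400

    draw = ImageDraw.Draw(img)
    size, r, color = 12, 6, (255, 0, 0)
    # Horizontal and vertical crosshair lines through the predicted point
    draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
    draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
    # Small circle around the point, as in the CLI code above
    draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
    img.save("predict_click_example.png")
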
agent/loops/__init__.py CHANGED
@@ -10,5 +10,19 @@ from . import omniparser
 from . import gta1
 from . import composed_grounded
 from . import glm45v
+from . import opencua
+from . import internvl
+from . import holo
 
-__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"]
+__all__ = [
+    "anthropic",
+    "openai",
+    "uitars",
+    "omniparser",
+    "gta1",
+    "composed_grounded",
+    "glm45v",
+    "opencua",
+    "internvl",
+    "holo",
+]
agent/loops/anthropic.py CHANGED
@@ -1577,11 +1577,10 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
                 isinstance(item.get("action"), dict)):
 
                 action = item["action"]
-                if action.get("type") == "click":
+                if action.get("x") and action.get("y"):
                     x = action.get("x")
                     y = action.get("y")
-                    if x is not None and y is not None:
-                        return (int(x), int(y))
+                    return (int(x), int(y))
 
         return None
 
agent/loops/composed_grounded.py CHANGED
@@ -126,7 +126,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
 
agent/loops/glm45v.py CHANGED
@@ -844,7 +844,7 @@ Where x,y are coordinates normalized to 0-999 range."""
         api_kwargs = {
             "model": model,
             "messages": litellm_messages,
-            "max_tokens": 100,
+            "max_tokens": 2056,
             "temperature": 0.001,
             "extra_body": {
                 "skip_special_tokens": False,
@@ -856,6 +856,7 @@ Where x,y are coordinates normalized to 0-999 range."""
 
         # Extract response content
         response_content = response.choices[0].message.content.strip()
+        print(response)
 
         # Parse response for click coordinates
         # Look for coordinates in the response, handling special tokens
@@ -866,7 +867,7 @@ Where x,y are coordinates normalized to 0-999 range."""
             # Fallback: look for coordinates without special tokens
             coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
             match = re.search(coord_pattern, response_content)
-
+            
             if match:
                 x, y = int(match.group(1)), int(match.group(2))
 
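A quick check (not part of the diff) of the fallback coordinate pattern used above, run against a made-up GLM-4.5V style response:

    import re

    coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)"
    # Hypothetical model output; coordinates are in the 0-999 normalized range
    response_content = "left_click(start_box='[412,375]')"
    match = re.search(coord_pattern, response_content)
    if match:
        x, y = int(match.group(1)), int(match.group(2))
        print(x, y)  # 412 375
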
agent/loops/gta1.py CHANGED
@@ -155,7 +155,7 @@ class GTA1Config(AsyncAgentConfig):
         api_kwargs = {
             "model": model,
             "messages": [system_message, user_message],
-            "max_tokens": 32,
+            "max_tokens": 2056,
             "temperature": 0.0,
             **kwargs
         }
agent/loops/holo.py ADDED
@@ -0,0 +1,216 @@
+"""
+Holo 1.5 agent loop implementation for click prediction using litellm.acompletion.
+
+Implements the Holo1.5 grounding behavior:
+- Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int}
+- Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor)
+- If resized, maps predicted coordinates back to the original screenshot resolution
+
+Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter)
+will handle loading based on the provided model name.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from .base import AsyncAgentConfig
+from ..types import AgentCapability
+
+
+def _strip_hf_prefix(model: str) -> str:
+    """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load."""
+    if "/" in model and model.lower().startswith("huggingface-local/"):
+        return model.split("/", 1)[1]
+    return model
+
+
+def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]:
+    """
+    Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor.
+
+    Returns (processed_image, (orig_w, orig_h)). If transformers or processor unavailable,
+    returns the original image and size without resizing.
+    """
+    orig_w, orig_h = image.size
+    try:
+        # Import lazily to avoid hard dependency if not installed
+        from transformers import AutoProcessor  # type: ignore
+        from transformers.models.qwen2_vl.image_processing_qwen2_vl import (  # type: ignore
+            smart_resize,
+        )
+
+        processor_name = _strip_hf_prefix(model)
+        processor = AutoProcessor.from_pretrained(processor_name)
+        image_processor = getattr(processor, "image_processor", None)
+        if image_processor is None:
+            return image, (orig_w, orig_h)
+
+        factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1)
+        min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
+        max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
+
+        resized_h, resized_w = smart_resize(
+            orig_h,
+            orig_w,
+            factor=factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        if (resized_w, resized_h) == (orig_w, orig_h):
+            return image, (orig_w, orig_h)
+
+        processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS)
+        return processed, (orig_w, orig_h)
+    except Exception:
+        # If any failure (no transformers, processor load error), fall back to original
+        return image, (orig_w, orig_h)
+
+
+def _build_holo_prompt(instruction: str) -> str:
+    """Construct the Holo1.5 grounding prompt."""
+    # Keep it close to the cookbook while avoiding heavy schema generation
+    schema_hint = '{"action": "click_absolute", "x": <int>, "y": <int>}'
+    return (
+        "Localize an element on the GUI image according to the provided target and output a click position. "
+        f"You must output a valid JSON following the format: {schema_hint} "
+        f"Your target is: {instruction}"
+    )
+
+
+def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]:
+    """
+    Parse JSON from model output and extract x, y ints.
+    Tries to find the first JSON object substring if extra text is present.
+    """
+    try:
+        # Fast path: direct JSON
+        data = json.loads(output_text)
+    except Exception:
+        # Try to locate a JSON object within the text
+        start = output_text.find("{")
+        end = output_text.rfind("}")
+        if start == -1 or end == -1 or end <= start:
+            return None
+        try:
+            data = json.loads(output_text[start : end + 1])
+        except Exception:
+            return None
+
+    try:
+        x = int(data.get("x"))
+        y = int(data.get("y"))
+        return x, y
+    except Exception:
+        return None
+
+
+@register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*")
+class HoloConfig(AsyncAgentConfig):
+    """Holo is a family of UI grounding models from H Company"""
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        # Holo models are only trained on UI localization tasks, not all-in-one agent
+        raise NotImplementedError()
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs,
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using Holo1.5 via litellm.acompletion.
+
+        - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available
+        - Prompts for JSON with absolute pixel coordinates
+        - Parses x,y and maps back to original screenshot size if resized
+        """
+        try:
+            img_bytes = base64.b64decode(image_b64)
+            original_img = Image.open(BytesIO(img_bytes))
+        except Exception:
+            return None
+
+        # Optional preprocessing
+        processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model)
+
+        # If we resized, send the resized image; otherwise send original
+        img_to_send = processed_img
+        buf = BytesIO()
+        img_to_send.save(buf, format="PNG")
+        processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+
+        prompt = _build_holo_prompt(instruction)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{processed_b64}"},
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        api_kwargs = {
+            "model": model,
+            "messages": messages,
+            # Deterministic, small output
+            "max_tokens": kwargs.get("max_tokens", 256),
+            "temperature": kwargs.get("temperature", 0.0),
+        }
+
+        response = await litellm.acompletion(**api_kwargs)
+        output_text = (response.choices[0].message.content or "").strip()  # type: ignore
+
+        coords = _parse_click_json(output_text)
+        if coords is None:
+            return None
+
+        x, y = coords
+
+        # Map back to original size if we resized
+        proc_w, proc_h = img_to_send.size
+        if (proc_w, proc_h) != (orig_w, orig_h):
+            try:
+                sx = orig_w / float(proc_w)
+                sy = orig_h / float(proc_h)
+                x = int(round(x * sx))
+                y = int(round(y * sy))
+            except Exception:
+                # Fallback: clamp within original bounds
+                pass
+
+        # Clamp to original image bounds
+        x = max(0, min(orig_w - 1, x))
+        y = max(0, min(orig_h - 1, y))
+        return x, y
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click"]
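For reference (not part of the diff), the JSON fallback in _parse_click_json behaves like this minimal sketch on a made-up Holo-style reply:

    import json

    # Hypothetical model reply with extra prose around the JSON object
    output_text = 'Sure. {"action": "click_absolute", "x": 742, "y": 312} Done.'

    try:
        data = json.loads(output_text)  # fails: the reply is not pure JSON
    except Exception:
        start, end = output_text.find("{"), output_text.rfind("}")
        data = json.loads(output_text[start:end + 1])

    print(int(data["x"]), int(data["y"]))  # 742 312
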
agent/loops/internvl.py ADDED
@@ -0,0 +1,185 @@
+"""
+InternVL agent loop implementation for click prediction using litellm.acompletion.
+
+Implements the ScreenSpot InternVL grounding baseline behavior:
+- Uses the exact grounding prompt format with <image> and <ref> tags
+- Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]]
+- Converts to pixel coordinates relative to the original screenshot size
+
+Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter)
+will handle loading based on the provided model name.
+"""
+
+from __future__ import annotations
+
+import base64
+import math
+import re
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+
+from PIL import Image
+import litellm
+
+from ..decorators import register_agent
+from .composed_grounded import ComposedGroundedConfig
+from ..types import AgentCapability
+
+
+# Regex patterns for extracting coordinates
+# Accept optional whitespace and optional decimal fractions
+_NUM = r"(\d+(?:\.\d+)?)"
+_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
+_BBOX_PATTERN = re.compile(
+    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
+)
+
+
+def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
+    """Extract the first [[x,y]] as normalized (0-1000) floats."""
+    m = _POINT_PATTERN.search(text)
+    if not m:
+        return None
+    try:
+        x = float(m.group(1))
+        y = float(m.group(2))
+        return x, y
+    except Exception:
+        return None
+
+
+def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]:
+    """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats."""
+    matches = list(_BBOX_PATTERN.finditer(text))
+    if not matches:
+        return None
+    m = matches[-1]
+    try:
+        x1 = float(m.group(1))
+        y1 = float(m.group(2))
+        x2 = float(m.group(3))
+        y2 = float(m.group(4))
+        return x1, y1, x2, y2
+    except Exception:
+        return None
+
+
+def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]:
+    """Scale 0-1000 normalized coordinates to pixel coordinates for given image size."""
+    x_px = int(math.floor((x_norm / 1000.0) * width))
+    y_px = int(math.floor((y_norm / 1000.0) * height))
+    # Clamp to image bounds just in case
+    x_px = max(0, min(width - 1, x_px))
+    y_px = max(0, min(height - 1, y_px))
+    return x_px, y_px
+
+
+@register_agent(models=r"(?i).*InternVL.*")
+class InternVLConfig(ComposedGroundedConfig):
+    """InternVL agent configuration reusing ComposedGroundedConfig for steps and
+    overriding predict_click to implement ScreenSpot InternVL grounding baseline."""
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Fallback to a self-composed model"""
+        return await super().predict_step(
+            messages=messages,
+            model=f"{model}+{model}",
+            tools=tools,
+            max_retries=max_retries,
+            stream=stream,
+            computer_handler=computer_handler,
+            _on_api_start=_on_api_start,
+            _on_api_end=_on_api_end,
+            _on_usage=_on_usage,
+            _on_screenshot=_on_screenshot,
+            **kwargs
+        )
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using InternVL via litellm.acompletion.
+
+        Behavior mirrors the ScreenSpot InternVL baseline:
+        - Prompt: "<image>\nPlease provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. Answer in the format of [[x1, y1, x2, y2]]"
+        - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing
+        - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot
+        """
+        try:
+            # Decode image dimensions to scale the normalized outputs
+            img_bytes = base64.b64decode(image_b64)
+            image = Image.open(BytesIO(img_bytes))
+            width, height = image.size
+        except Exception:
+            # If decoding fails, proceed with a safe default size to avoid crash
+            width, height = 1920, 1080
+
+        # Build grounding prompt exactly like the baseline
+        grounding_prompt = (
+            f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. "
+            f"Answer in the format of [[x1, y1, x2, y2]]"
+        )
+
+        # Prepare messages for LiteLLM
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                    },
+                    {"type": "text", "text": grounding_prompt},
+                ],
+            }
+        ]
+
+        # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading
+        api_kwargs = {
+            "model": model,
+            "messages": messages,
+            # Conservative generation params akin to baseline (deterministic)
+            "max_tokens": kwargs.get("max_tokens", 256),
+            "temperature": kwargs.get("temperature", 0.0),
+        }
+
+        response = await litellm.acompletion(**api_kwargs)
+        output_text = (response.choices[0].message.content or "").strip()  # type: ignore
+
+        # print(f"InternVL output: {output_text}")
+
+        # Try to parse a point first; if absent, parse bbox and take center
+        point = _extract_first_point(output_text)
+        if point is None:
+            bbox = _extract_last_bbox(output_text)
+            if bbox is None:
+                return None
+            x1, y1, x2, y2 = bbox
+            cx = (x1 + x2) / 2.0
+            cy = (y1 + y2) / 2.0
+            point = (cx, cy)
+
+        x_norm, y_norm = point
+        x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height)
+        return (x_px, y_px)
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]
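
To illustrate the 0-1000 normalization handling above (not part of the diff), a made-up bbox reply scaled to a 1920x1080 screenshot:

    import math
    import re

    _NUM = r"(\d+(?:\.\d+)?)"
    _BBOX_PATTERN = re.compile(
        r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
    )

    output_text = "The element is at [[480, 250, 520, 290]]"  # hypothetical InternVL reply
    x1, y1, x2, y2 = (float(g) for g in _BBOX_PATTERN.search(output_text).groups())
    cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0  # bbox center in 0-1000 space: (500.0, 270.0)

    width, height = 1920, 1080
    x_px = int(math.floor((cx / 1000.0) * width))   # 960
    y_px = int(math.floor((cy / 1000.0) * height))  # 291
    print(x_px, y_px)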