cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/gta1.py
CHANGED
@@ -5,75 +5,80 @@ Code: https://github.com/Yan98/GTA1
 """
 
 import asyncio
+import base64
 import json
+import math
 import re
-import base64
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-from io import BytesIO
 import uuid
-from
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-import
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
-SYSTEM_PROMPT =
+SYSTEM_PROMPT = """
 You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
 
 Output the coordinate pair exactly:
 (x,y)
-
+""".strip()
+
 
 def extract_coordinates(raw_string: str) -> Tuple[float, float]:
     """Extract coordinates from model output."""
     try:
         matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
-        return tuple(map(float, matches[0]))
+        return tuple(map(float, matches[0]))  # type: ignore
     except:
         return (0.0, 0.0)
 
-def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360) -> Tuple[int, int]:
+
+def smart_resize(
+    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
+) -> Tuple[int, int]:
     """Smart resize function similar to qwen_vl_utils."""
     # Calculate the total pixels
     total_pixels = height * width
-
+
     # If already within bounds, return original dimensions
     if min_pixels <= total_pixels <= max_pixels:
         # Round to nearest factor
         new_height = (height // factor) * factor
         new_width = (width // factor) * factor
         return new_height, new_width
-
+
     # Calculate scaling factor
     if total_pixels > max_pixels:
         scale = (max_pixels / total_pixels) ** 0.5
     else:
         scale = (min_pixels / total_pixels) ** 0.5
-
+
     # Apply scaling
     new_height = int(height * scale)
     new_width = int(width * scale)
-
+
     # Round to nearest factor
     new_height = (new_height // factor) * factor
     new_width = (new_width // factor) * factor
-
+
     # Ensure minimum size
     new_height = max(new_height, factor)
     new_width = max(new_width, factor)
-
+
     return new_height, new_width
 
+
 @register_agent(models=r".*GTA1.*")
 class GTA1Config(AsyncAgentConfig):
     """GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""
-
+
     def __init__(self):
         self.current_model = None
         self.last_screenshot_b64 = None
-
 
     async def predict_step(
         self,
@@ -87,25 +92,21 @@ class GTA1Config(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         raise NotImplementedError()
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[float, float]]:
         """
         Predict click coordinates using GTA1 model via litellm.acompletion.
-
+
         Args:
             model: The GTA1 model name
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """
@@ -113,66 +114,62 @@ class GTA1Config(AsyncAgentConfig):
         image_data = base64.b64decode(image_b64)
         image = Image.open(BytesIO(image_data))
         width, height = image.width, image.height
-
+
         # Smart resize the image (similar to qwen_vl_utils)
         resized_height, resized_width = smart_resize(
-            height,
+            height,
+            width,
             factor=28,  # Default factor for Qwen models
             min_pixels=3136,
-            max_pixels=4096 * 2160
+            max_pixels=4096 * 2160,
        )
         resized_image = image.resize((resized_width, resized_height))
         scale_x, scale_y = width / resized_width, height / resized_height
-
+
         # Convert resized image back to base64
         buffered = BytesIO()
         resized_image.save(buffered, format="PNG")
         resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
-
+
         # Prepare system and user messages
         system_message = {
             "role": "system",
-            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width)
+            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width),
         }
-
+
         user_message = {
             "role": "user",
             "content": [
                 {
                     "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{resized_image_b64}"
-                    }
+                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                 },
-                {
-                    "type": "text",
-                    "text": instruction
-                }
-            ]
+                {"type": "text", "text": instruction},
+            ],
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
             "messages": [system_message, user_message],
-            "max_tokens":
+            "max_tokens": 2056,
             "temperature": 0.0,
-            **kwargs
+            **kwargs,
         }
-
+
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Extract response text
-        output_text = response.choices[0].message.content
-
+        output_text = response.choices[0].message.content  # type: ignore
+
         # Extract and rescale coordinates
-        pred_x, pred_y = extract_coordinates(output_text)
+        pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
         pred_x *= scale_x
         pred_y *= scale_y
-
+
         return (math.floor(pred_x), math.floor(pred_y))
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click"]
agent/loops/holo.py
ADDED
@@ -0,0 +1,218 @@
+"""
+Holo 1.5 agent loop implementation for click prediction using litellm.acompletion.
+
+Implements the Holo1.5 grounding behavior:
+- Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int}
+- Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor)
+- If resized, maps predicted coordinates back to the original screenshot resolution
+
+Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter)
+will handle loading based on the provided model name.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..types import AgentCapability
+from .base import AsyncAgentConfig
+
+
+def _strip_hf_prefix(model: str) -> str:
+    """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load."""
+    if "/" in model and model.lower().startswith("huggingface-local/"):
+        return model.split("/", 1)[1]
+    return model
+
+
+def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]:
+    """
+    Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor.
+
+    Returns (processed_image, (orig_w, orig_h)). If transformers or processor unavailable,
+    returns the original image and size without resizing.
+    """
+    orig_w, orig_h = image.size
+    try:
+        # Import lazily to avoid hard dependency if not installed
+        from transformers import AutoProcessor  # type: ignore
+        from transformers.models.qwen2_vl.image_processing_qwen2_vl import (  # type: ignore
+            smart_resize,
+        )
+
+        processor_name = _strip_hf_prefix(model)
+        processor = AutoProcessor.from_pretrained(processor_name)
+        image_processor = getattr(processor, "image_processor", None)
+        if image_processor is None:
+            return image, (orig_w, orig_h)
+
+        factor = getattr(image_processor, "patch_size", 14) * getattr(
+            image_processor, "merge_size", 1
+        )
+        min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
+        max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
+
+        resized_h, resized_w = smart_resize(
+            orig_h,
+            orig_w,
+            factor=factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        if (resized_w, resized_h) == (orig_w, orig_h):
+            return image, (orig_w, orig_h)
+
+        processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS)
+        return processed, (orig_w, orig_h)
+    except Exception:
+        # If any failure (no transformers, processor load error), fall back to original
+        return image, (orig_w, orig_h)
+
+
+def _build_holo_prompt(instruction: str) -> str:
+    """Construct the Holo1.5 grounding prompt."""
+    # Keep it close to the cookbook while avoiding heavy schema generation
+    schema_hint = '{"action": "click_absolute", "x": <int>, "y": <int>}'
+    return (
+        "Localize an element on the GUI image according to the provided target and output a click position. "
+        f"You must output a valid JSON following the format: {schema_hint} "
+        f"Your target is: {instruction}"
+    )
+
+
+def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]:
+    """
+    Parse JSON from model output and extract x, y ints.
+    Tries to find the first JSON object substring if extra text is present.
+    """
+    try:
+        # Fast path: direct JSON
+        data = json.loads(output_text)
+    except Exception:
+        # Try to locate a JSON object within the text
+        start = output_text.find("{")
+        end = output_text.rfind("}")
+        if start == -1 or end == -1 or end <= start:
+            return None
+        try:
+            data = json.loads(output_text[start : end + 1])
+        except Exception:
+            return None
+
+    try:
+        x = int(data.get("x"))
+        y = int(data.get("y"))
+        return x, y
+    except Exception:
+        return None
+
+
+@register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*")
+class HoloConfig(AsyncAgentConfig):
+    """Holo is a family of UI grounding models from H Company"""
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        # Holo models are only trained on UI localization tasks, not all-in-one agent
+        raise NotImplementedError()
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs,
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using Holo1.5 via litellm.acompletion.
+
+        - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available
+        - Prompts for JSON with absolute pixel coordinates
+        - Parses x,y and maps back to original screenshot size if resized
+        """
+        try:
+            img_bytes = base64.b64decode(image_b64)
+            original_img = Image.open(BytesIO(img_bytes))
+        except Exception:
+            return None
+
+        # Optional preprocessing
+        processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model)
+
+        # If we resized, send the resized image; otherwise send original
+        img_to_send = processed_img
+        buf = BytesIO()
+        img_to_send.save(buf, format="PNG")
+        processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+
+        prompt = _build_holo_prompt(instruction)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{processed_b64}"},
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        api_kwargs = {
+            "model": model,
+            "messages": messages,
+            # Deterministic, small output
+            "max_tokens": kwargs.get("max_tokens", 256),
+            "temperature": kwargs.get("temperature", 0.0),
+        }
+
+        response = await litellm.acompletion(**api_kwargs)
+        output_text = (response.choices[0].message.content or "").strip()  # type: ignore
+
+        coords = _parse_click_json(output_text)
+        if coords is None:
+            return None
+
+        x, y = coords
+
+        # Map back to original size if we resized
+        proc_w, proc_h = img_to_send.size
+        if (proc_w, proc_h) != (orig_w, orig_h):
+            try:
+                sx = orig_w / float(proc_w)
+                sy = orig_h / float(proc_h)
+                x = int(round(x * sx))
+                y = int(round(y * sy))
+            except Exception:
+                # Fallback: clamp within original bounds
+                pass
+
+        # Clamp to original image bounds
+        x = max(0, min(orig_w - 1, x))
+        y = max(0, min(orig_h - 1, y))
+        return x, y
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click"]
agent/loops/internvl.py
ADDED
@@ -0,0 +1,180 @@
+"""
+InternVL agent loop implementation for click prediction using litellm.acompletion.
+
+Implements the ScreenSpot InternVL grounding baseline behavior:
+- Uses the exact grounding prompt format with <image> and <ref> tags
+- Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]]
+- Converts to pixel coordinates relative to the original screenshot size
+
+Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter)
+will handle loading based on the provided model name.
+"""
+
+from __future__ import annotations
+
+import base64
+import math
+import re
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..types import AgentCapability
+from .composed_grounded import ComposedGroundedConfig
+
+# Regex patterns for extracting coordinates
+# Accept optional whitespace and optional decimal fractions
+_NUM = r"(\d+(?:\.\d+)?)"
+_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
+_BBOX_PATTERN = re.compile(
+    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
+)
+
+
+def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
+    """Extract the first [[x,y]] as normalized (0-1000) floats."""
+    m = _POINT_PATTERN.search(text)
+    if not m:
+        return None
+    try:
+        x = float(m.group(1))
+        y = float(m.group(2))
+        return x, y
+    except Exception:
+        return None
+
+
+def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]:
+    """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats."""
+    matches = list(_BBOX_PATTERN.finditer(text))
+    if not matches:
+        return None
+    m = matches[-1]
+    try:
+        x1 = float(m.group(1))
+        y1 = float(m.group(2))
+        x2 = float(m.group(3))
+        y2 = float(m.group(4))
+        return x1, y1, x2, y2
+    except Exception:
+        return None
+
+
+def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]:
+    """Scale 0-1000 normalized coordinates to pixel coordinates for given image size."""
+    x_px = int(math.floor((x_norm / 1000.0) * width))
+    y_px = int(math.floor((y_norm / 1000.0) * height))
+    # Clamp to image bounds just in case
+    x_px = max(0, min(width - 1, x_px))
+    y_px = max(0, min(height - 1, y_px))
+    return x_px, y_px
+
+
+@register_agent(models=r"(?i).*InternVL.*")
+class InternVLConfig(ComposedGroundedConfig):
+    """InternVL agent configuration reusing ComposedGroundedConfig for steps and
+    overriding predict_click to implement ScreenSpot InternVL grounding baseline."""
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Fallback to a self-composed model"""
+        return await super().predict_step(
+            messages=messages,
+            model=f"{model}+{model}",
+            tools=tools,
+            max_retries=max_retries,
+            stream=stream,
+            computer_handler=computer_handler,
+            _on_api_start=_on_api_start,
+            _on_api_end=_on_api_end,
+            _on_usage=_on_usage,
+            _on_screenshot=_on_screenshot,
+            **kwargs,
+        )
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using InternVL via litellm.acompletion.
+
+        Behavior mirrors the ScreenSpot InternVL baseline:
+        - Prompt: "<image>\nPlease provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. Answer in the format of [[x1, y1, x2, y2]]"
+        - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing
+        - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot
+        """
+        try:
+            # Decode image dimensions to scale the normalized outputs
+            img_bytes = base64.b64decode(image_b64)
+            image = Image.open(BytesIO(img_bytes))
+            width, height = image.size
+        except Exception:
+            # If decoding fails, proceed with a safe default size to avoid crash
+            width, height = 1920, 1080
+
+        # Build grounding prompt exactly like the baseline
+        grounding_prompt = (
+            f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. "
+            f"Answer in the format of [[x1, y1, x2, y2]]"
+        )
+
+        # Prepare messages for LiteLLM
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                    },
+                    {"type": "text", "text": grounding_prompt},
+                ],
+            }
+        ]
+
+        # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading
+        api_kwargs = {
+            "model": model,
+            "messages": messages,
+            # Conservative generation params akin to baseline (deterministic)
+            "max_tokens": kwargs.get("max_tokens", 256),
+            "temperature": kwargs.get("temperature", 0.0),
+        }
+
+        response = await litellm.acompletion(**api_kwargs)
+        output_text = (response.choices[0].message.content or "").strip()  # type: ignore
+
+        # print(f"InternVL output: {output_text}")
+
+        # Try to parse a point first; if absent, parse bbox and take center
+        point = _extract_first_point(output_text)
+        if point is None:
+            bbox = _extract_last_bbox(output_text)
+            if bbox is None:
+                return None
+            x1, y1, x2, y2 = bbox
+            cx = (x1 + x2) / 2.0
+            cy = (y1 + y2) / 2.0
+            point = (cx, cy)
+
+        x_norm, y_norm = point
+        x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height)
+        return (x_px, y_px)
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]
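
The 0-1000 normalized output convention is the part most likely to trip up integrators, so here is a small standalone sketch of the parse-then-scale path used above (the model answer and screenshot size are made up for illustration):

import re

_NUM = r"(\d+(?:\.\d+)?)"
_BBOX = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")

def bbox_center_to_pixels(answer: str, width: int, height: int):
    # Take the last [[x1,y1,x2,y2]] box, use its center, and scale 0-1000 -> pixels
    matches = list(_BBOX.finditer(answer))
    if not matches:
        return None
    x1, y1, x2, y2 = (float(g) for g in matches[-1].groups())
    cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0
    return int(cx / 1000.0 * width), int(cy / 1000.0 * height)

# Hypothetical answer for a 1920x1080 screenshot:
print(bbox_center_to_pixels("The element is at [[100, 200, 300, 400]].", 1920, 1080))
# -> (384, 324)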