cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/gta1.py
CHANGED

@@ -5,75 +5,80 @@ Code: https://github.com/Yan98/GTA1
 """
 
 import asyncio
+import base64
 import json
+import math
 import re
-import base64
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-from io import BytesIO
 import uuid
-from PIL import Image
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+
 import litellm
-import math
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 SYSTEM_PROMPT = """
 You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
 
 Output the coordinate pair exactly:
 (x,y)
-"""
+""".strip()
+
 
 def extract_coordinates(raw_string: str) -> Tuple[float, float]:
     """Extract coordinates from model output."""
     try:
         matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
-        return tuple(map(float, matches[0]))
+        return tuple(map(float, matches[0]))  # type: ignore
     except:
         return (0.0, 0.0)
 
-def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360) -> Tuple[int, int]:
+
+def smart_resize(
+    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
+) -> Tuple[int, int]:
     """Smart resize function similar to qwen_vl_utils."""
     # Calculate the total pixels
     total_pixels = height * width
-
+
     # If already within bounds, return original dimensions
     if min_pixels <= total_pixels <= max_pixels:
         # Round to nearest factor
         new_height = (height // factor) * factor
         new_width = (width // factor) * factor
         return new_height, new_width
-
+
     # Calculate scaling factor
     if total_pixels > max_pixels:
         scale = (max_pixels / total_pixels) ** 0.5
     else:
         scale = (min_pixels / total_pixels) ** 0.5
-
+
     # Apply scaling
     new_height = int(height * scale)
     new_width = int(width * scale)
-
+
     # Round to nearest factor
     new_height = (new_height // factor) * factor
     new_width = (new_width // factor) * factor
-
+
     # Ensure minimum size
     new_height = max(new_height, factor)
     new_width = max(new_width, factor)
-
+
     return new_height, new_width
 
+
 @register_agent(models=r".*GTA1.*")
 class GTA1Config(AsyncAgentConfig):
     """GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""
-
+
     def __init__(self):
         self.current_model = None
         self.last_screenshot_b64 = None
-
 
     async def predict_step(
         self,

@@ -87,25 +92,21 @@ class GTA1Config(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         raise NotImplementedError()
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[float, float]]:
         """
         Predict click coordinates using GTA1 model via litellm.acompletion.
-
+
         Args:
             model: The GTA1 model name
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """

@@ -113,66 +114,62 @@
         image_data = base64.b64decode(image_b64)
         image = Image.open(BytesIO(image_data))
         width, height = image.width, image.height
-
+
         # Smart resize the image (similar to qwen_vl_utils)
         resized_height, resized_width = smart_resize(
-            height, width,
+            height,
+            width,
             factor=28,  # Default factor for Qwen models
             min_pixels=3136,
-            max_pixels=4096 * 2160
+            max_pixels=4096 * 2160,
         )
         resized_image = image.resize((resized_width, resized_height))
         scale_x, scale_y = width / resized_width, height / resized_height
-
+
         # Convert resized image back to base64
         buffered = BytesIO()
         resized_image.save(buffered, format="PNG")
         resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
-
+
         # Prepare system and user messages
         system_message = {
             "role": "system",
-            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width)
+            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width),
        }
-
+
        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
-                   "image_url": {
-                       "url": f"data:image/png;base64,{resized_image_b64}"
-                   }
+                   "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
-               {
-                   "type": "text",
-                   "text": instruction
-               }
-           ]
+               {"type": "text", "text": instruction},
+           ],
        }
-
+
        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
-           **kwargs
+           **kwargs,
        }
-
+
        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)
-
+
        # Extract response text
-       output_text = response.choices[0].message.content
-
+       output_text = response.choices[0].message.content  # type: ignore
+
        # Extract and rescale coordinates
-       pred_x, pred_y = extract_coordinates(output_text)
+       pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
        pred_x *= scale_x
        pred_y *= scale_y
-
+
        return (math.floor(pred_x), math.floor(pred_y))
-
+
    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
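Beneath the reformatting, the logic of predict_click is unchanged: downscale the screenshot with smart_resize, ask the model for a point on the resized image, then multiply the answer back by the per-axis scale factors. A minimal sketch of that round trip follows; smart_resize is condensed from the hunk above (same behavior, fewer lines), and the predicted point is a made-up stand-in for the output that extract_coordinates would parse.

import math
from typing import Tuple

def smart_resize(
    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
) -> Tuple[int, int]:
    # Condensed from the diff above: keep total pixels within
    # [min_pixels, max_pixels], snapped to multiples of `factor`.
    total = height * width
    if min_pixels <= total <= max_pixels:
        return (height // factor) * factor, (width // factor) * factor
    scale = ((max_pixels if total > max_pixels else min_pixels) / total) ** 0.5
    new_h = max(int(height * scale) // factor * factor, factor)
    new_w = max(int(width * scale) // factor * factor, factor)
    return new_h, new_w

width, height = 5120, 2880                    # hypothetical screenshot size
resized_h, resized_w = smart_resize(height, width, max_pixels=4096 * 2160)
scale_x, scale_y = width / resized_w, height / resized_h

pred_x, pred_y = 800.0, 450.0                 # stand-in for the model's "(x,y)" answer
print(math.floor(pred_x * scale_x), math.floor(pred_y * scale_y))  # 1037 585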
agent/loops/holo.py
CHANGED

@@ -21,8 +21,8 @@ import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from .base import AsyncAgentConfig
 from ..types import AgentCapability
+from .base import AsyncAgentConfig
 
 
 def _strip_hf_prefix(model: str) -> str:

@@ -53,7 +53,9 @@ def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]:
     if image_processor is None:
         return image, (orig_w, orig_h)
 
-    factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1)
+    factor = getattr(image_processor, "patch_size", 14) * getattr(
+        image_processor, "merge_size", 1
+    )
     min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
     max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
 
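The one substantive line reflowed here computes the dimension snap factor from the processor's patch_size times merge_size, with getattr fallbacks for processors that lack either attribute. A small sketch under that assumption; FakeProcessor is a hypothetical stand-in for a HuggingFace image processor, not part of the package.

# FakeProcessor is a hypothetical stand-in; getattr falls back to the
# defaults used in _maybe_smart_resize when an attribute is missing.
class FakeProcessor:
    patch_size = 14
    merge_size = 2  # e.g. Qwen-style processors merge 2x2 patches

image_processor = FakeProcessor()
factor = getattr(image_processor, "patch_size", 14) * getattr(
    image_processor, "merge_size", 1
)
min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
print(factor, min_pixels, max_pixels)  # 28 65536 2359296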
agent/loops/internvl.py
CHANGED

@@ -18,13 +18,12 @@ import re
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple
 
-from PIL import Image
 import litellm
+from PIL import Image
 
 from ..decorators import register_agent
-from .composed_grounded import ComposedGroundedConfig
 from ..types import AgentCapability
-
+from .composed_grounded import ComposedGroundedConfig
 
 # Regex patterns for extracting coordinates
 # Accept optional whitespace and optional decimal fractions

@@ -91,7 +90,7 @@ class InternVLConfig(ComposedGroundedConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """Fallback to a self-composed model"""
         return await super().predict_step(

@@ -105,15 +104,11 @@ class InternVLConfig(ComposedGroundedConfig):
             _on_api_end=_on_api_end,
             _on_usage=_on_usage,
             _on_screenshot=_on_screenshot,
-            **kwargs
+            **kwargs,
         )
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using InternVL via litellm.acompletion.
agent/loops/moondream3.py
CHANGED

@@ -14,27 +14,28 @@ Differences from composed_grounded:
 
 from __future__ import annotations
 
-import uuid
 import base64
 import io
-
+import uuid
+from typing import Any, Dict, List, Optional, Tuple
 
-from PIL import Image, ImageDraw, ImageFont
 import litellm
+from PIL import Image, ImageDraw, ImageFont
 
 from ..decorators import register_agent
-from ..types import AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
     get_all_element_descriptions,
 )
+from ..types import AgentCapability
 
 _MOONDREAM_SINGLETON = None
 
+
 def get_moondream_model() -> Any:
     """Get a singleton instance of the Moondream3 preview model."""
     global _MOONDREAM_SINGLETON

@@ -42,6 +43,7 @@ def get_moondream_model() -> Any:
     try:
         import torch
         from transformers import AutoModelForCausalLM
+
         _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
             "moondream/moondream3-preview",
             trust_remote_code=True,

@@ -95,6 +97,7 @@ def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         filtered.append(msg_copy)
     return filtered
 
+
 def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
     """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
 

@@ -132,7 +135,12 @@ def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
         y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
         x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
         y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
-        left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
+        left, top, right, bottom = (
+            int(x_min * W),
+            int(y_min * H),
+            int(x_max * W),
+            int(y_max * H),
+        )
         left, top = max(0, left), max(0, top)
         right, bottom = min(W - 1, right), min(H - 1, bottom)
         crop = base_img.crop((left, top, right, bottom))

@@ -200,6 +208,7 @@ def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
     annotated_b64 = _image_to_b64(annotated)
     return annotated_b64, detected_names
 
+
 GROUNDED_COMPUTER_TOOL_SCHEMA = {
     "type": "function",
     "function": {

@@ -270,6 +279,7 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
     },
 }
 
+
 @register_agent(r"moondream3\+.*", priority=2)
 class Moondream3PlusConfig(AsyncAgentConfig):
     def __init__(self):

@@ -321,14 +331,25 @@ class Moondream3PlusConfig(AsyncAgentConfig):
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
+                        {
+                            "type": "output_text",
+                            "text": "Taking a screenshot to analyze the current screen.",
+                        }
                     ],
                 },
-                {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
+                {
+                    "type": "computer_call",
+                    "call_id": call_id,
+                    "status": "completed",
+                    "action": {"type": "screenshot"},
+                },
                 {
                     "type": "computer_call_output",
                     "call_id": call_id,
-                    "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
+                    "output": {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{screenshot_b64}",
+                    },
                },
            ]
            last_image_b64 = screenshot_b64

@@ -354,13 +375,16 @@
                    "content": [
                        {"type": "input_text", "text": "Detected form UI elements on screen:"},
                        {"type": "input_text", "text": names_text},
-                       {"type": "input_text", "text": "Please continue with the next action needed to perform your task."},
+                       {
+                           "type": "input_text",
+                           "text": "Please continue with the next action needed to perform your task.",
+                       },
                    ],
                }
            )
 
            tool_schemas = []
-           for schema in tools:
+           for schema in tools or []:
                if schema.get("type") == "computer":
                    tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
                else: