cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/openai.py CHANGED
@@ -6,12 +6,14 @@ import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
 
 async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""
@@ -21,26 +23,26 @@ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
-
+
     # Get environment from the computer handler
     try:
         environment = await computer_handler.get_environment()
     except Exception:
         # Fallback to default environment if method fails
         environment = "linux"
-
+
     return {
         "type": "computer_use_preview",
         "display_width": width,
         "display_height": height,
-        "environment": environment  # mac, windows, linux, browser
+        "environment": environment,  # mac, windows, linux, browser
     }
 
 
 async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format
@@ -49,19 +51,19 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
-            openai_tools.append({ "type": "function", **schema["function"] })
-
+            openai_tools.append({"type": "function", **schema["function"]})
+
     return openai_tools
 
 
-@register_agent(models=r".*computer-use-preview.*")
+@register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
-
+
     Supports OpenAI's computer use preview models.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -75,11 +77,11 @@ class OpenAIComputerUseConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format
            model: Model name to use
@@ -92,12 +94,12 @@
            _on_usage: Callback for usage tracking
            _on_screenshot: Callback for screenshot events
            **kwargs: Additional arguments
-
+
        Returns:
            Dictionary with "output" (output items) and "usage" array
        """
        tools = tools or []
-
+
        # Prepare tools for OpenAI API
        openai_tools = await _prepare_tools_for_openai(tools)
 
@@ -110,16 +112,16 @@
            "reasoning": {"summary": "concise"},
            "truncation": "auto",
            "num_retries": max_retries,
-            **kwargs
+            **kwargs,
        }
-
+
        # Call API start hook
        if _on_api_start:
            await _on_api_start(api_kwargs)
-
+
        # Use liteLLM responses
        response = await litellm.aresponses(**api_kwargs)
-
+
        # Call API end hook
        if _on_api_end:
            await _on_api_end(api_kwargs, response)
@@ -136,24 +138,21 @@
        output_dict = response.model_dump()
        output_dict["usage"] = usage
        return output_dict
-
+
    async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates based on image and instruction.
-
+
        Uses OpenAI computer-use-preview with manually constructed input items
        and a prompt that instructs the agent to only output clicks.
-
+
        Args:
            model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
-
+
        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
@@ -161,20 +160,28 @@
        # Manually construct input items with image and click instruction
        input_items = [
            {
-                "role": "user",
-                "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                "role": "user",
+                "content": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
            },
            {
                "role": "user",
                "content": [
-                    {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{image_b64}"
-                    }
-                ]
-            }
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
        ]
-
+
        # Get image dimensions from base64 data
        try:
            image_data = base64.b64decode(image_b64)
@@ -183,15 +190,15 @@
        except Exception:
            # Fallback to default dimensions if image parsing fails
            display_width, display_height = 1024, 768
-
+
        # Prepare computer tool for click actions
        computer_tool = {
            "type": "computer_use_preview",
            "display_width": display_width,
            "display_height": display_height,
-            "environment": "windows"
+            "environment": "windows",
        }
-
+
        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
@@ -200,35 +207,35 @@
            "stream": False,
            "reasoning": {"summary": "concise"},
            "truncation": "auto",
-            "max_tokens": 100  # Keep response short for click prediction
+            "max_tokens": 200,  # Keep response short for click prediction
+            **kwargs,
        }
-
+
        # Use liteLLM responses
        response = await litellm.aresponses(**api_kwargs)
-
+
        # Extract click coordinates from response output
        output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-
+        output_items = output_dict.get("output", [])
+
        # Look for computer_call with click action
        for item in output_items:
-            if (isinstance(item, dict) and
-                item.get("type") == "computer_call" and
-                isinstance(item.get("action"), dict)):
-
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                action = item["action"]
-                if action.get("type") == "click":
-                    x = action.get("x")
-                    y = action.get("y")
-                    if x is not None and y is not None:
-                        return (int(x), int(y))
-
+                if action.get("x") is not None and action.get("y") is not None:
+                    return (int(action.get("x")), int(action.get("y")))
+
        return None
-
+
    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by this agent config.
-
+
        Returns:
            List of capability strings
        """
agent/loops/opencua.py ADDED
@@ -0,0 +1,134 @@
+"""
+OpenCUA agent loop implementation for click prediction using litellm.acompletion
+Based on OpenCUA model for GUI grounding tasks.
+"""
+
+import asyncio
+import base64
+import json
+import math
+import re
+import uuid
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+from .composed_grounded import ComposedGroundedConfig
+
+
+def extract_coordinates_from_click(text: str) -> Optional[Tuple[int, int]]:
+    """Extract coordinates from click(x=..., y=...) or pyautogui.click(x=..., y=...) format.
+
+    This function supports parsing both generic click() and legacy pyautogui.click() formats
+    for backwards compatibility with models that may still output pyautogui format.
+    """
+    try:
+        # Look for click(x=1443, y=343) or pyautogui.click(x=1443, y=343) pattern
+        pattern = r"(?:pyautogui\.)?click\(x=(\d+),\s*y=(\d+)\)"
+        match = re.search(pattern, text)
+        if match:
+            x, y = int(match.group(1)), int(match.group(2))
+            return (x, y)
+        return None
+    except Exception:
+        return None
+
+
+@register_agent(models=r"(?i).*OpenCUA.*")
+class OpenCUAConfig(ComposedGroundedConfig):
+    """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
+
+    def __init__(self):
+        super().__init__()
+        self.current_model = None
+        self.last_screenshot_b64 = None
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Fallback to a self-composed model"""
+        return await super().predict_step(
+            messages=messages,
+            model=f"{model}+{model}",
+            tools=tools,
+            max_retries=max_retries,
+            stream=stream,
+            computer_handler=computer_handler,
+            _on_api_start=_on_api_start,
+            _on_api_end=_on_api_end,
+            _on_usage=_on_usage,
+            _on_screenshot=_on_screenshot,
+            **kwargs,
+        )
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using OpenCUA model via litellm.acompletion.
+
+        Args:
+            model: The OpenCUA model name
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Prepare system message
+        system_prompt = (
+            "You are a GUI agent. You are given a task and a screenshot of the screen. "
+            "You need to perform a series of click actions to complete the task."
+        )
+
+        system_message = {"role": "system", "content": system_prompt}
+
+        # Prepare user message with image and instruction
+        user_message = {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                {"type": "text", "text": f"Click on {instruction}"},
+            ],
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": [system_message, user_message],
+            "max_new_tokens": 2056,
+            "temperature": 0,
+            **kwargs,
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Extract response text
+        output_text = response.choices[0].message.content
+        # print(output_text)
+
+        # Extract coordinates from click format (supports both click() and pyautogui.click() for backwards compatibility)
+        coordinates = extract_coordinates_from_click(output_text)
+
+        return coordinates
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click"]
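
A quick illustration (not part of the diff) of what extract_coordinates_from_click accepts, assuming the wheel's agent package is importable: both the generic click() form and the legacy pyautogui.click() form resolve to the same tuple.

from agent.loops.opencua import extract_coordinates_from_click

print(extract_coordinates_from_click("click(x=1443, y=343)"))         # (1443, 343)
print(extract_coordinates_from_click("pyautogui.click(x=12, y=34)"))  # (12, 34)
print(extract_coordinates_from_click("no click action here"))         # None
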
agent/loops/uiins.py ADDED
@@ -0,0 +1,175 @@
+"""
+UI-Ins agent loop implementation for click prediction using litellm.acompletion
+Paper: https://arxiv.org/pdf/2510.202861
+Code: https://github.com/alibaba/UI-Ins
+"""
+
+import asyncio
+import base64
+import json
+import math
+import re
+import uuid
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
+SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in tags and finally output the function in tags.\n"""
+
+
+def parse_coordinates(raw_string: str) -> tuple[int, int]:
+    matches = re.findall(r"\[(\d+),\s*(\d+)\]", raw_string)
+    if matches:
+        return tuple(map(int, matches[0]))
+    return -1, -1
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 3136,
+    max_pixels: int = 8847360,
+) -> Tuple[int, int]:
+    """Smart resize function similar to qwen_vl_utils."""
+    # Calculate the total pixels
+    total_pixels = height * width
+
+    # If already within bounds, return original dimensions
+    if min_pixels <= total_pixels <= max_pixels:
+        # Round to nearest factor
+        new_height = (height // factor) * factor
+        new_width = (width // factor) * factor
+        return new_height, new_width
+
+    # Calculate scaling factor
+    if total_pixels > max_pixels:
+        scale = (max_pixels / total_pixels) ** 0.5
+    else:
+        scale = (min_pixels / total_pixels) ** 0.5
+
+    # Apply scaling
+    new_height = int(height * scale)
+    new_width = int(width * scale)
+
+    # Round to nearest factor
+    new_height = (new_height // factor) * factor
+    new_width = (new_width // factor) * factor
+
+    # Ensure minimum size
+    new_height = max(new_height, factor)
+    new_width = max(new_width, factor)
+
+    return new_height, new_width
+
+
+@register_agent(models=r".*UI-Ins.*")
+class UIInsConfig(AsyncAgentConfig):
+    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""
+
+    def __init__(self):
+        self.current_model = None
+        self.last_screenshot_b64 = None
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        raise NotImplementedError()
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[float, float]]:
+        """
+        Predict click coordinates using UI-Ins model via litellm.acompletion.
+
+        Args:
+            model: The UI-Ins model name
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Decode base64 image
+        image_data = base64.b64decode(image_b64)
+        image = Image.open(BytesIO(image_data))
+        width, height = image.width, image.height
+
+        # Smart resize the image (similar to qwen_vl_utils)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=28,  # Default factor for Qwen models
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+        )
+        resized_image = image.resize((resized_width, resized_height))
+        scale_x, scale_y = width / resized_width, height / resized_height
+
+        # Convert resized image back to base64
+        buffered = BytesIO()
+        resized_image.save(buffered, format="PNG")
+        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
+
+        # Prepare system and user messages
+        system_message = {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {"type": "text", "text": SYSTEM_PROMPT},
+            ],
+        }
+
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
+                },
+                {"type": "text", "text": instruction},
+            ],
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": [system_message, user_message],
+            "max_tokens": 2056,
+            "temperature": 0.0,
+            **kwargs,
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Extract response text
+        output_text = response.choices[0].message.content  # type: ignore
+
+        # Extract and rescale coordinates
+        pred_x, pred_y = parse_coordinates(output_text)  # type: ignore
+        pred_x *= scale_x
+        pred_y *= scale_y
+
+        return (math.floor(pred_x), math.floor(pred_y))
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click"]
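
The coordinate handling in predict_click above is worth spelling out: the screenshot is snapped to multiples of 28 (the patch factor smart_resize uses for Qwen-family vision models, per the comment in the code), the model predicts on the resized image, and the prediction is mapped back through the inverse scale. A standalone sketch with made-up dimensions (not from the package):

import math

orig_w, orig_h = 2560, 1440
new_h, new_w = smart_resize(orig_h, orig_w, factor=28, min_pixels=3136, max_pixels=4096 * 2160)
# 2560x1440 is already inside the pixel budget, so both sides just snap down to
# multiples of 28: 2548x1428
scale_x, scale_y = orig_w / new_w, orig_h / new_h

# a model prediction of [700, 400] on the resized image maps back to the original screenshot
print(math.floor(700 * scale_x), math.floor(400 * scale_y))  # 703 403
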