cua-agent 0.4.34-py3-none-any.whl → 0.4.35-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/openai.py CHANGED

@@ -6,12 +6,14 @@ import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import …
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from ..types import …
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
 
 async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""

@@ -21,26 +23,26 @@ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
-
+
     # Get environment from the computer handler
     try:
         environment = await computer_handler.get_environment()
     except Exception:
         # Fallback to default environment if method fails
         environment = "linux"
-
+
     return {
         "type": "computer_use_preview",
         "display_width": width,
         "display_height": height,
-        "environment": environment  # mac, windows, linux, browser
+        "environment": environment,  # mac, windows, linux, browser
     }
 
 
 async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format

@@ -49,18 +51,19 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
-            openai_tools.append({
-                …
+            openai_tools.append({"type": "function", **schema["function"]})
+
     return openai_tools
 
+
 @register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
-
+
     Supports OpenAI's computer use preview models.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],

@@ -74,11 +77,11 @@ class OpenAIComputerUseConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format
             model: Model name to use

@@ -91,12 +94,12 @@ class OpenAIComputerUseConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Prepare tools for OpenAI API
         openai_tools = await _prepare_tools_for_openai(tools)
 

@@ -109,16 +112,16 @@ class OpenAIComputerUseConfig:
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
             "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)

@@ -135,24 +138,21 @@ class OpenAIComputerUseConfig:
         output_dict = response.model_dump()
         output_dict["usage"] = usage
         return output_dict
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Uses OpenAI computer-use-preview with manually constructed input items
         and a prompt that instructs the agent to only output clicks.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """

@@ -160,7 +160,7 @@ class OpenAIComputerUseConfig:
         # Manually construct input items with image and click instruction
         input_items = [
             {
-                "role": "user",
+                "role": "user",
                 "content": f"""You are a UI grounding expert. Follow these guidelines:
 
 1. NEVER ask for confirmation. Complete all tasks autonomously.

@@ -172,19 +172,16 @@ class OpenAIComputerUseConfig:
 7. Be decisive and action-oriented. Complete the requested task fully.
 
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
             },
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{image_b64}"
-                    }
-                ]
-            }
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
         ]
-
+
         # Get image dimensions from base64 data
         try:
             image_data = base64.b64decode(image_b64)

@@ -193,15 +190,15 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
         except Exception:
             # Fallback to default dimensions if image parsing fails
             display_width, display_height = 1024, 768
-
+
         # Prepare computer tool for click actions
         computer_tool = {
             "type": "computer_use_preview",
             "display_width": display_width,
             "display_height": display_height,
-            "environment": "windows"
+            "environment": "windows",
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,

@@ -210,32 +207,34 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "stream": False,
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
-            "max_tokens": 200  # Keep response short for click prediction
+            "max_tokens": 200,  # Keep response short for click prediction
         }
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Extract click coordinates from response output
         output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-
+        output_items = output_dict.get("output", [])
+
         # Look for computer_call with click action
         for item in output_items:
-            if (
-                item
-                …
-                …
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                 action = item["action"]
                 if action.get("x") is not None and action.get("y") is not None:
                     return (int(action.get("x")), int(action.get("y")))
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
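Beyond the formatter churn visible above (trailing commas, sorted imports, blank-line normalization), two substantive changes land in this file: function tools are now flattened into a single {"type": "function", **schema["function"]} entry, matching the adjacent comment that liteLLM expects {type, name, description, parameters} at the top level, and predict_click now type-guards each response item before reading its action. Below is a minimal sketch of the new guard logic run against a mock output list; extract_click is a hypothetical standalone helper, and the payload is illustrative, not a captured API response:

    from typing import Any, List, Optional, Tuple

    def extract_click(output_items: List[Any]) -> Optional[Tuple[int, int]]:
        """Mirror of the new guard: accept only dict-shaped computer_call
        items whose action is itself a dict with explicit x/y values."""
        for item in output_items:
            if (
                isinstance(item, dict)
                and item.get("type") == "computer_call"
                and isinstance(item.get("action"), dict)
            ):
                action = item["action"]
                if action.get("x") is not None and action.get("y") is not None:
                    return (int(action["x"]), int(action["y"]))
        return None

    # Illustrative payload: a reasoning item (skipped), a computer_call whose
    # action is None (explicitly skipped by the isinstance check), and a
    # well-formed click.
    mock_output = [
        {"type": "reasoning", "summary": "locating the submit button"},
        {"type": "computer_call", "action": None},
        {"type": "computer_call", "action": {"type": "click", "x": 312, "y": 64}},
    ]
    assert extract_click(mock_output) == (312, 64)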
agent/loops/opencua.py CHANGED

@@ -4,20 +4,22 @@ Based on OpenCUA model for GUI grounding tasks.
 """
 
 import asyncio
+import base64
 import json
+import math
 import re
-import base64
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-from io import BytesIO
 import uuid
-from …
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-import …
+from PIL import Image
 
-from .composed_grounded import ComposedGroundedConfig
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+from .composed_grounded import ComposedGroundedConfig
+
 
 def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     """Extract coordinates from pyautogui.click(x=..., y=...) format."""

@@ -32,10 +34,11 @@ def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     except Exception:
         return None
 
+
 @register_agent(models=r"(?i).*OpenCUA.*")
 class OpenCUAConfig(ComposedGroundedConfig):
     """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
-
+
     def __init__(self):
         super().__init__()
         self.current_model = None

@@ -53,7 +56,7 @@ class OpenCUAConfig(ComposedGroundedConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """Fallback to a self-composed model"""
         return await super().predict_step(

@@ -67,24 +70,20 @@ class OpenCUAConfig(ComposedGroundedConfig):
             _on_api_end=_on_api_end,
             _on_usage=_on_usage,
             _on_screenshot=_on_screenshot,
-            **kwargs
+            **kwargs,
         )
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using OpenCUA model via litellm.acompletion.
-
+
         Args:
             model: The OpenCUA model name
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """

@@ -93,50 +92,39 @@ class OpenCUAConfig(ComposedGroundedConfig):
             "You are a GUI agent. You are given a task and a screenshot of the screen. "
             "You need to perform a series of pyautogui actions to complete the task."
         )
-
-        system_message = {
-            "role": "system",
-            "content": system_prompt
-        }
-
+
+        system_message = {"role": "system", "content": system_prompt}
+
         # Prepare user message with image and instruction
         user_message = {
             "role": "user",
             "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{image_b64}"
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": f"Click on {instruction}"
-                }
-            ]
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                {"type": "text", "text": f"Click on {instruction}"},
+            ],
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
             "messages": [system_message, user_message],
             "max_new_tokens": 2056,
             "temperature": 0,
-            **kwargs
+            **kwargs,
         }
-
+
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Extract response text
         output_text = response.choices[0].message.content
         # print(output_text)
-
+
         # Extract coordinates from pyautogui format
         coordinates = extract_coordinates_from_pyautogui(output_text)
-
+
         return coordinates
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
        return ["click"]