cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release.


This version of cua-agent might be problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/base.py CHANGED
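Editor's note: the hunks below are formatting-only (import ordering, trailing commas, whitespace); the AsyncAgentConfig protocol itself is unchanged. As orientation, here is a minimal sketch of a class that satisfies this protocol. The EchoAgentConfig name and its canned return values are hypothetical, and the keyword parameters of predict_step that the hunks elide (tools, computer_handler, the callback hooks) are collapsed into **kwargs.

from typing import Any, Dict, List, Optional, Tuple


class EchoAgentConfig:
    """Hypothetical loop used only to illustrate the AsyncAgentConfig shape."""

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        **kwargs: Any,  # tools, computer_handler, callback hooks, ... (elided here)
    ) -> Dict[str, Any]:
        # Return Responses-format output items plus usage accounting.
        return {
            "output": [
                {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": "No-op step."}],
                }
            ],
            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "response_cost": 0.0},
        }

    async def predict_click(
        self, model: str, image_b64: str, instruction: str
    ) -> Optional[Tuple[int, int]]:
        # A grounding-capable loop would return pixel coordinates here.
        return None

    def get_capabilities(self) -> List[str]:
        # Capability values are strings such as "step" and "click".
        return ["step"]

Concrete loops in this package (anthropic.py, openai.py, the new qwen.py, and so on) follow the same three-method shape and appear to be registered through the register_agent decorator imported by the loop modules.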
@@ -2,13 +2,15 @@
 Base protocol for async agent configurations
 """

-from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
 from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Protocol, Tuple, Union
+
 from ..types import AgentCapability

+
 class AsyncAgentConfig(Protocol):
     """Protocol defining the interface for async agent configurations."""
-
+
     @abstractmethod
     async def predict_step(
         self,
@@ -22,11 +24,11 @@ class AsyncAgentConfig(Protocol):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format (message, function_call, computer_call)
             model: Model name to use
@@ -39,37 +41,34 @@ class AsyncAgentConfig(Protocol):
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         ...
-
+
     @abstractmethod
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             None or tuple with (x, y) coordinates
        """
         ...
-
+
     @abstractmethod
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings (e.g., ["step", "click"])
         """
agent/loops/composed_grounded.py CHANGED
@@ -3,122 +3,117 @@ Composed-grounded agent loop implementation that combines grounding and thinking
 Uses a two-stage approach: grounding model for element detection, thinking model for reasoning.
 """

-import uuid
 import asyncio
-import json
 import base64
-from typing import Dict, List, Any, Optional, Tuple
+import json
+import uuid
 from io import BytesIO
-from PIL import Image
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from PIL import Image

+from ..agent import find_agent_config
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
-    get_all_element_descriptions
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
 )
-from ..agent import find_agent_config
+from ..types import AgentCapability, AgentResponse, Messages, Tools

 GROUNDED_COMPUTER_TOOL_SCHEMA = {
-    "type": "function",
-    "function": {
-        "name": "computer",
-        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "action": {
-                    "type": "string",
-                    "enum": [
-                        "screenshot",
-                        "click",
-                        "double_click",
-                        "drag",
-                        "type",
-                        "keypress",
-                        "scroll",
-                        "move",
-                        "wait",
-                        "get_current_url",
-                        "get_dimensions",
-                        "get_environment"
-                    ],
-                    "description": "The action to perform (required for all actions)"
-                },
-                "element_description": {
-                    "type": "string",
-                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)"
-                },
-                "start_element_description": {
-                    "type": "string",
-                    "description": "Description of the element to start dragging from (required for drag action)"
-                },
-                "end_element_description": {
-                    "type": "string",
-                    "description": "Description of the element to drag to (required for drag action)"
-                },
-                "text": {
-                    "type": "string",
-                    "description": "The text to type (required for type action)"
-                },
-                "keys": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag action)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress action)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click action (required for click and double_click action)",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
+                },
             },
-                    "description": "Key(s) to press (required for keypress action)"
-                },
-                "button": {
-                    "type": "string",
-                    "enum": [
-                        "left",
-                        "right",
-                        "wheel",
-                        "back",
-                        "forward"
-                    ],
-                    "description": "The mouse button to use for click action (required for click and double_click action)",
-                },
-                "scroll_x": {
-                    "type": "integer",
-                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+            "required": ["action"],
         },
-                "scroll_y": {
-                    "type": "integer",
-                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
-                },
-            },
-            "required": [
-                "action"
-            ]
-        }
-    }
+    },
 }

+
 def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Prepare tools for grounded API format"""
     grounded_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
         else:
             grounded_tools.append(schema)
-
+
     return grounded_tools

+
 def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
     """Get the last computer call output image from messages."""
     for message in reversed(messages):
-        if (isinstance(message, dict) and
-            message.get("type") == "computer_call_output" and
-            isinstance(message.get("output"), dict) and
-            message["output"].get("type") == "input_image"):
+        if (
+            isinstance(message, dict)
+            and message.get("type") == "computer_call_output"
+            and isinstance(message.get("output"), dict)
+            and message["output"].get("type") == "input_image"
+        ):
             image_url = message["output"].get("image_url", "")
             if image_url.startswith("data:image/png;base64,"):
                 return image_url.split(",", 1)[1]
@@ -129,14 +124,14 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
 class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
-
+
     The model parameter should be in format: "grounding_model+thinking_model"
     e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro"
     """
-
+
     def __init__(self):
         self.desc2xy: Dict[str, Tuple[float, float]] = {}
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -150,11 +145,11 @@ class ComposedGroundedConfig(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Composed-grounded predict step implementation.
-
+
         Process:
         0. Store last computer call image, if none then take a screenshot
         1. Convert computer calls from xy to descriptions
@@ -167,18 +162,20 @@ class ComposedGroundedConfig(AsyncAgentConfig):
         """
         # Parse the composed model
         if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         pre_output_items = []
-
+
         # Step 0: Store last computer call image, if none then take a screenshot
         last_image_b64 = get_last_computer_call_image(messages)
         if last_image_b64 is None:
             # Take a screenshot
-            screenshot_b64 = await computer_handler.screenshot() # type: ignore
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
             if screenshot_b64:
-
+
                 call_id = uuid.uuid4().hex
                 pre_output_items += [
                     {
@@ -187,45 +184,42 @@ class ComposedGroundedConfig(AsyncAgentConfig):
                         "content": [
                             {
                                 "type": "output_text",
-                                "text": "Taking a screenshot to see the current computer screen."
+                                "text": "Taking a screenshot to see the current computer screen.",
                             }
-                        ]
+                        ],
                     },
                     {
-                        "action": {
-                            "type": "screenshot"
-                        },
+                        "action": {"type": "screenshot"},
                         "call_id": call_id,
                         "status": "completed",
-                        "type": "computer_call"
+                        "type": "computer_call",
                     },
                     {
                         "type": "computer_call_output",
                         "call_id": call_id,
                         "output": {
                             "type": "input_image",
-                            "image_url": f"data:image/png;base64,{screenshot_b64}"
-                        }
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
                     },
                 ]
                 last_image_b64 = screenshot_b64
-
+
                 # Call screenshot callback if provided
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
-        tool_schemas = _prepare_tools_for_grounded(tools) # type: ignore
+
+        tool_schemas = _prepare_tools_for_grounded(tools)  # type: ignore

         # Step 1: Convert computer calls from xy to descriptions
         input_messages = messages + pre_output_items
         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
-
+
         # Step 2: Convert responses items to completion messages
         completion_messages = convert_responses_items_to_completion_messages(
-            messages_with_descriptions,
-            allow_images_in_tool_results=False
+            messages_with_descriptions, allow_images_in_tool_results=False
         )
-
+
         # Step 3: Call thinking model with litellm.acompletion
         api_kwargs = {
             "model": thinking_model,
@@ -233,98 +227,90 @@ class ComposedGroundedConfig(AsyncAgentConfig):
             "tools": tool_schemas,
             "max_retries": max_retries,
             "stream": stream,
-            **kwargs
+            **kwargs,
         }

         if use_prompt_caching:
             api_kwargs["use_prompt_caching"] = use_prompt_caching
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Make the completion call
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract usage information
         usage = {
-            **response.usage.model_dump(), # type: ignore
+            **response.usage.model_dump(),  # type: ignore
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(usage)
-
+
         # Step 4: Convert completion messages back to responses items format
-        response_dict = response.model_dump() # type: ignore
+        response_dict = response.model_dump()  # type: ignore
         choice_messages = [choice["message"] for choice in response_dict["choices"]]
         thinking_output_items = []
-
+
         for choice_message in choice_messages:
-            thinking_output_items.extend(convert_completion_messages_to_responses_items([choice_message]))
-
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
         # Step 5: Get all element descriptions and populate desc2xy mapping
         element_descriptions = get_all_element_descriptions(thinking_output_items)
-
+
         if element_descriptions and last_image_b64:
             # Use grounding model to predict coordinates for each description
             grounding_agent_conf = find_agent_config(grounding_model)
             if grounding_agent_conf:
                 grounding_agent = grounding_agent_conf.agent_class()
-
+
                 for desc in element_descriptions:
-                    for _ in range(3): # try 3 times
+                    for _ in range(3):  # try 3 times
                         coords = await grounding_agent.predict_click(
-                            model=grounding_model,
-                            image_b64=last_image_b64,
-                            instruction=desc
+                            model=grounding_model, image_b64=last_image_b64, instruction=desc
                         )
                         if coords:
                             self.desc2xy[desc] = coords
                             break
-
+
         # Step 6: Convert computer calls from descriptions back to xy coordinates
         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
-
+
         # Step 7: Return output and usage
-        return {
-            "output": pre_output_items + final_output_items,
-            "usage": usage
-        }
-
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using the grounding model.
-
+
         For composed models, uses only the grounding model part for click prediction.
         """
         # Parse the composed model to get grounding model
         if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         # Find and use the grounding agent
         grounding_agent_conf = find_agent_config(grounding_model)
         if grounding_agent_conf:
             grounding_agent = grounding_agent_conf.agent_class()
             return await grounding_agent.predict_click(
-                model=grounding_model,
-                image_b64=image_b64,
-                instruction=instruction,
-                **kwargs
+                model=grounding_model, image_b64=image_b64, instruction=instruction, **kwargs
             )
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click", "step"]
agent/loops/gemini.py CHANGED
@@ -29,6 +29,7 @@ def _lazy_import_genai():
     try:
         from google import genai # type: ignore
         from google.genai import types # type: ignore
+
         return genai, types
     except Exception as e: # pragma: no cover
         raise RuntimeError(
@@ -134,7 +135,13 @@ def _map_gemini_fc_to_computer_call(
             dx = magnitude
         elif direction == "left":
             dx = -magnitude
-        action = {"type": "scroll", "scroll_x": dx, "scroll_y": dy, "x": int(screen_w / 2), "y": int(screen_h / 2)}
+        action = {
+            "type": "scroll",
+            "scroll_x": dx,
+            "scroll_y": dy,
+            "x": int(screen_w / 2),
+            "y": int(screen_h / 2),
+        }
     elif name == "scroll_at":
         x = _denormalize(int(args.get("x", 500)), screen_w)
         y = _denormalize(int(args.get("y", 500)), screen_h)
@@ -155,7 +162,14 @@ def _map_gemini_fc_to_computer_call(
         y = _denormalize(int(args.get("y", 0)), screen_h)
         dx = _denormalize(int(args.get("destination_x", x)), screen_w)
         dy = _denormalize(int(args.get("destination_y", y)), screen_h)
-        action = {"type": "drag", "start_x": x, "start_y": y, "end_x": dx, "end_y": dy, "button": "left"}
+        action = {
+            "type": "drag",
+            "start_x": x,
+            "start_y": y,
+            "end_x": dx,
+            "end_y": dy,
+            "button": "left",
+        }
     elif name == "wait_5_seconds":
         action = {"type": "wait"}
     else:
@@ -242,20 +256,25 @@ class GeminiComputerUseConfig(AsyncAgentConfig):
         }

         if _on_api_start:
-            await _on_api_start({
-                "model": api_kwargs["model"],
-                # "contents": api_kwargs["contents"], # Disabled for now
-                "config": api_kwargs["config"],
-            })
+            await _on_api_start(
+                {
+                    "model": api_kwargs["model"],
+                    # "contents": api_kwargs["contents"], # Disabled for now
+                    "config": api_kwargs["config"],
+                }
+            )

         response = client.models.generate_content(**api_kwargs)

         if _on_api_end:
-            await _on_api_end({
-                "model": api_kwargs["model"],
-                # "contents": api_kwargs["contents"], # Disabled for now
-                "config": api_kwargs["config"],
-            }, response)
+            await _on_api_end(
+                {
+                    "model": api_kwargs["model"],
+                    # "contents": api_kwargs["contents"], # Disabled for now
+                    "config": api_kwargs["config"],
+                },
+                response,
+            )

         # Usage (Gemini SDK may not always provide token usage; populate when available)
         usage: Dict[str, Any] = {}
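The mapper above converts Gemini function calls into computer_call actions, passing x and y through _denormalize before use. That helper is not part of this diff; the sketch below is one plausible reading, assuming Gemini Computer Use reports coordinates on a 0-1000 normalized grid, which would explain the 500 defaults and the int(screen_w / 2) centre fallback for document scrolling.

# Hypothetical reconstruction of the _denormalize helper referenced above.
# Assumes coordinates arrive in a 0-1000 normalized space (an assumption, not
# something this diff shows) and maps them onto the real screen size in pixels.
def _denormalize(value: int, screen_dim: int, scale: int = 1000) -> int:
    return int(value * screen_dim / scale)


# Example: scroll_at defaults of (500, 500) land at the centre of a 1920x1080 screen.
print(_denormalize(500, 1920), _denormalize(500, 1080))  # 960 540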