cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of cua-agent has been flagged as potentially problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/base.py CHANGED
@@ -2,13 +2,15 @@
 Base protocol for async agent configurations
 """
 
-from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
 from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Protocol, Tuple, Union
+
 from ..types import AgentCapability
 
+
 class AsyncAgentConfig(Protocol):
     """Protocol defining the interface for async agent configurations."""
-
+
     @abstractmethod
     async def predict_step(
         self,
@@ -22,11 +24,11 @@ class AsyncAgentConfig(Protocol):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **generation_config,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format (message, function_call, computer_call)
             model: Model name to use
@@ -38,38 +40,40 @@ class AsyncAgentConfig(Protocol):
             _on_api_end: Callback for API end
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
-            **kwargs: Additional arguments
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         ...
-
+
     @abstractmethod
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **generation_config
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Args:
             model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
        Returns:
            None or tuple with (x, y) coordinates
        """
        ...
-
+
    @abstractmethod
    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by this agent config.
-
+
        Returns:
            List of capability strings (e.g., ["step", "click"])
        """
agent/loops/composed_grounded.py CHANGED
@@ -3,112 +3,117 @@ Composed-grounded agent loop implementation that combines grounding and thinking
 Uses a two-stage approach: grounding model for element detection, thinking model for reasoning.
 """
 
-import uuid
 import asyncio
-import json
 import base64
-from typing import Dict, List, Any, Optional, Tuple
+import json
+import uuid
 from io import BytesIO
-from PIL import Image
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from PIL import Image
 
+from ..agent import find_agent_config
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
-    get_all_element_descriptions
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
 )
-from ..agent import find_agent_config
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 GROUNDED_COMPUTER_TOOL_SCHEMA = {
-  "type": "function",
-  "function": {
-    "name": "computer",
-    "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
-    "parameters": {
-      "type": "object",
-      "properties": {
-        "action": {
-          "type": "string",
-          "enum": [
-            "screenshot",
-            "click",
-            "double_click",
-            "drag",
-            "type",
-            "keypress",
-            "scroll",
-            "move",
-            "wait",
-            "get_current_url",
-            "get_dimensions",
-            "get_environment"
-          ],
-          "description": "The action to perform"
-        },
-        "element_description": {
-          "type": "string",
-          "description": "Description of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
-        },
-        "start_element_description": {
-          "type": "string",
-          "description": "Description of the element to start dragging from (required for drag action)"
-        },
-        "end_element_description": {
-          "type": "string",
-          "description": "Description of the element to drag to (required for drag action)"
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag action)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress action)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click action (required for click and double_click action)",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
+                },
+            },
+            "required": ["action"],
         },
-        "text": {
-          "type": "string",
-          "description": "The text to type (required for type action)"
-        },
-        "keys": {
-          "type": "string",
-          "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
-        },
-        "button": {
-          "type": "string",
-          "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
-        },
-        "scroll_x": {
-          "type": "integer",
-          "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
-        },
-        "scroll_y": {
-          "type": "integer",
-          "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
-        },
-      },
-      "required": [
-        "action"
-      ]
-    }
-  }
+    },
 }
 
+
 def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Prepare tools for grounded API format"""
     grounded_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
            grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
        else:
            grounded_tools.append(schema)
-
+
    return grounded_tools
 
+
 def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
     """Get the last computer call output image from messages."""
     for message in reversed(messages):
-        if (isinstance(message, dict) and
-            message.get("type") == "computer_call_output" and
-            isinstance(message.get("output"), dict) and
-            message["output"].get("type") == "input_image"):
+        if (
+            isinstance(message, dict)
+            and message.get("type") == "computer_call_output"
+            and isinstance(message.get("output"), dict)
+            and message["output"].get("type") == "input_image"
+        ):
             image_url = message["output"].get("image_url", "")
             if image_url.startswith("data:image/png;base64,"):
                 return image_url.split(",", 1)[1]
@@ -116,17 +121,17 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
-
+
     The model parameter should be in format: "grounding_model+thinking_model"
     e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro"
     """
-
+
     def __init__(self):
         self.desc2xy: Dict[str, Tuple[float, float]] = {}
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -140,11 +145,11 @@ class ComposedGroundedConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Composed-grounded predict step implementation.
-
+
         Process:
         0. Store last computer call image, if none then take a screenshot
         1. Convert computer calls from xy to descriptions
@@ -157,18 +162,20 @@ class ComposedGroundedConfig:
         """
         # Parse the composed model
         if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         pre_output_items = []
-
+
         # Step 0: Store last computer call image, if none then take a screenshot
         last_image_b64 = get_last_computer_call_image(messages)
         if last_image_b64 is None:
             # Take a screenshot
-            screenshot_b64 = await computer_handler.screenshot() # type: ignore
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
             if screenshot_b64:
-
+
                 call_id = uuid.uuid4().hex
                 pre_output_items += [
                     {
@@ -177,45 +184,42 @@ class ComposedGroundedConfig:
                         "content": [
                             {
                                 "type": "output_text",
-                                "text": "Taking a screenshot to see the current computer screen."
+                                "text": "Taking a screenshot to see the current computer screen.",
                             }
-                        ]
+                        ],
                     },
                     {
-                        "action": {
-                            "type": "screenshot"
-                        },
+                        "action": {"type": "screenshot"},
                         "call_id": call_id,
                         "status": "completed",
-                        "type": "computer_call"
+                        "type": "computer_call",
                     },
                     {
                         "type": "computer_call_output",
                         "call_id": call_id,
                         "output": {
                             "type": "input_image",
-                            "image_url": f"data:image/png;base64,{screenshot_b64}"
-                        }
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
                     },
                 ]
                 last_image_b64 = screenshot_b64
-
+
                 # Call screenshot callback if provided
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
-        tool_schemas = _prepare_tools_for_grounded(tools) # type: ignore
+
+        tool_schemas = _prepare_tools_for_grounded(tools)  # type: ignore
 
         # Step 1: Convert computer calls from xy to descriptions
         input_messages = messages + pre_output_items
         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
-
+
         # Step 2: Convert responses items to completion messages
         completion_messages = convert_responses_items_to_completion_messages(
-            messages_with_descriptions,
-            allow_images_in_tool_results=False
+            messages_with_descriptions, allow_images_in_tool_results=False
         )
-
+
         # Step 3: Call thinking model with litellm.acompletion
         api_kwargs = {
             "model": thinking_model,
@@ -223,96 +227,90 @@ class ComposedGroundedConfig:
             "tools": tool_schemas,
             "max_retries": max_retries,
             "stream": stream,
-            **kwargs
+            **kwargs,
         }
 
         if use_prompt_caching:
             api_kwargs["use_prompt_caching"] = use_prompt_caching
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Make the completion call
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract usage information
         usage = {
-            **response.usage.model_dump(), # type: ignore
+            **response.usage.model_dump(),  # type: ignore
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(usage)
-
+
        # Step 4: Convert completion messages back to responses items format
-        response_dict = response.model_dump() # type: ignore
+        response_dict = response.model_dump()  # type: ignore
        choice_messages = [choice["message"] for choice in response_dict["choices"]]
        thinking_output_items = []
-
+
        for choice_message in choice_messages:
-            thinking_output_items.extend(convert_completion_messages_to_responses_items([choice_message]))
-
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
        # Step 5: Get all element descriptions and populate desc2xy mapping
        element_descriptions = get_all_element_descriptions(thinking_output_items)
-
+
        if element_descriptions and last_image_b64:
            # Use grounding model to predict coordinates for each description
            grounding_agent_conf = find_agent_config(grounding_model)
            if grounding_agent_conf:
                grounding_agent = grounding_agent_conf.agent_class()
-
+
                for desc in element_descriptions:
-                    coords = await grounding_agent.predict_click(
-                        model=grounding_model,
-                        image_b64=last_image_b64,
-                        instruction=desc
-                    )
-                    if coords:
-                        self.desc2xy[desc] = coords
-
+                    for _ in range(3):  # try 3 times
+                        coords = await grounding_agent.predict_click(
+                            model=grounding_model, image_b64=last_image_b64, instruction=desc
+                        )
+                        if coords:
+                            self.desc2xy[desc] = coords
+                            break
+
        # Step 6: Convert computer calls from descriptions back to xy coordinates
        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
-
+
        # Step 7: Return output and usage
-        return {
-            "output": pre_output_items + final_output_items,
-            "usage": usage
-        }
-
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
    async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using the grounding model.
-
+
        For composed models, uses only the grounding model part for click prediction.
        """
        # Parse the composed model to get grounding model
        if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
        grounding_model, thinking_model = model.split("+", 1)
-
+
        # Find and use the grounding agent
        grounding_agent_conf = find_agent_config(grounding_model)
        if grounding_agent_conf:
            grounding_agent = grounding_agent_conf.agent_class()
            return await grounding_agent.predict_click(
-                model=grounding_model,
-                image_b64=image_b64,
-                instruction=instruction,
-                **kwargs
+                model=grounding_model, image_b64=image_b64, instruction=instruction, **kwargs
            )
-
+
        return None
-
+
    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click", "step"]
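
A hypothetical driver (not shipped in the wheel) showing how the composed loop is addressed: a grounding model and a thinking model joined with "+", with predict_click delegating to the grounding half only. The model string is the example from the class docstring; the screenshot path is a placeholder.

import asyncio
import base64

from agent.loops.composed_grounded import ComposedGroundedConfig


async def main() -> None:
    config = ComposedGroundedConfig()

    # Any PNG screenshot of the screen you want to ground against.
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()

    coords = await config.predict_click(
        model="huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro",
        image_b64=image_b64,
        instruction="red submit button",
    )
    print("predicted click:", coords)  # None or an (x, y) tuple


asyncio.run(main())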
agent/loops/fara/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""
+FARA-7B agent loop implementation.
+Original implementation from Microsoft: https://github.com/microsoft/Fara
+"""
+
+from .config import FaraVlmConfig
+
+__all__ = ("FaraVlmConfig",)
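
Downstream code can now import the FARA loop's config straight from the subpackage, e.g. (a trivial sketch; FaraVlmConfig itself is defined in the new agent/loops/fara/config.py, which is not shown in this excerpt):

from agent.loops.fara import FaraVlmConfig  # re-exported via __all__ above

print(FaraVlmConfig)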