PyPI - cua-agent - Versions diffs - 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl - Mend

cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (61) hide show

agent/__init__.py +4 -10
agent/__main__.py +2 -1
agent/adapters/huggingfacelocal_adapter.py +54 -61
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +110 -99
agent/adapters/models/__init__.py +14 -6
agent/adapters/models/generic.py +7 -4
agent/adapters/models/internvl.py +66 -30
agent/adapters/models/opencua.py +23 -8
agent/adapters/models/qwen2_5_vl.py +7 -4
agent/agent.py +184 -158
agent/callbacks/__init__.py +4 -4
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +18 -13
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +3 -1
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/telemetry.py +67 -61
agent/callbacks/trajectory_saver.py +90 -70
agent/cli.py +115 -110
agent/computers/__init__.py +13 -8
agent/computers/base.py +26 -17
agent/computers/cua.py +27 -23
agent/computers/custom.py +72 -69
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +235 -185
agent/integrations/hud/__init__.py +15 -21
agent/integrations/hud/agent.py +101 -83
agent/integrations/hud/proxy.py +90 -57
agent/loops/__init__.py +25 -21
agent/loops/anthropic.py +537 -483
agent/loops/base.py +13 -14
agent/loops/composed_grounded.py +135 -149
agent/loops/gemini.py +31 -12
agent/loops/glm45v.py +135 -133
agent/loops/gta1.py +47 -50
agent/loops/holo.py +4 -2
agent/loops/internvl.py +6 -11
agent/loops/moondream3.py +36 -12
agent/loops/omniparser.py +212 -209
agent/loops/openai.py +49 -50
agent/loops/opencua.py +29 -41
agent/loops/qwen.py +475 -0
agent/loops/uitars.py +237 -202
agent/proxy/examples.py +54 -50
agent/proxy/handlers.py +27 -34
agent/responses.py +330 -330
agent/types.py +11 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +23 -18
agent/ui/gradio/ui_components.py +310 -161
{cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
cua_agent-0.4.35.dist-info/RECORD +64 -0
cua_agent-0.4.34.dist-info/RECORD +0 -63
{cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
{cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0

agent/callbacks/__init__.py CHANGED Viewed

@@ -3,17 +3,17 @@ Callback system for ComputerAgent preprocessing and postprocessing hooks.
 """
 from .base import AsyncCallbackHandler
+from .budget_manager import BudgetManagerCallback
 from .image_retention import ImageRetentionCallback
 from .logging import LoggingCallback
-from .trajectory_saver import TrajectorySaverCallback
-from .budget_manager import BudgetManagerCallback
-from .telemetry import TelemetryCallback
 from .operator_validator import OperatorNormalizerCallback
 from .prompt_instructions import PromptInstructionsCallback
+from .telemetry import TelemetryCallback
+from .trajectory_saver import TrajectorySaverCallback
 __all__ = [
     "AsyncCallbackHandler",
-    "ImageRetentionCallback",
+    "ImageRetentionCallback",
     "LoggingCallback",
     "TrajectorySaverCallback",
     "BudgetManagerCallback",

agent/callbacks/base.py CHANGED Viewed

@@ -3,7 +3,7 @@ Base callback handler interface for ComputerAgent preprocessing and postprocessi
 """
 from abc import ABC, abstractmethod
-from typing import List, Dict, Any, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 class AsyncCallbackHandler(ABC):
@@ -16,42 +16,52 @@ class AsyncCallbackHandler(ABC):
         """Called at the start of an agent run loop."""
         pass
-    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
+    async def on_run_end(
+        self,
+        kwargs: Dict[str, Any],
+        old_items: List[Dict[str, Any]],
+        new_items: List[Dict[str, Any]],
+    ) -> None:
         """Called at the end of an agent run loop."""
         pass
-    async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
+    async def on_run_continue(
+        self,
+        kwargs: Dict[str, Any],
+        old_items: List[Dict[str, Any]],
+        new_items: List[Dict[str, Any]],
+    ) -> bool:
         """Called during agent run loop to determine if execution should continue.
         Args:
             kwargs: Run arguments
             old_items: Original messages
             new_items: New messages generated during run
         Returns:
             True to continue execution, False to stop
         """
         return True
     async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
         Called before messages are sent to the agent loop.
         Args:
             messages: List of message dictionaries to preprocess
         Returns:
             List of preprocessed message dictionaries
         """
         return messages
     async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
         Called after the agent loop returns output.
         Args:
             output: List of output message dictionaries to postprocess
         Returns:
             List of postprocessed output dictionaries
         """
@@ -60,63 +70,67 @@ class AsyncCallbackHandler(ABC):
     async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
         """
         Called when a computer call is about to start.
         Args:
             item: The computer call item dictionary
         """
         pass
-    async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
+    async def on_computer_call_end(
+        self, item: Dict[str, Any], result: List[Dict[str, Any]]
+    ) -> None:
         """
         Called when a computer call has completed.
         Args:
             item: The computer call item dictionary
             result: The result of the computer call
         """
         pass
     async def on_function_call_start(self, item: Dict[str, Any]) -> None:
         """
         Called when a function call is about to start.
         Args:
             item: The function call item dictionary
         """
         pass
-    async def on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
+    async def on_function_call_end(
+        self, item: Dict[str, Any], result: List[Dict[str, Any]]
+    ) -> None:
         """
         Called when a function call has completed.
         Args:
             item: The function call item dictionary
             result: The result of the function call
         """
         pass
     async def on_text(self, item: Dict[str, Any]) -> None:
         """
         Called when a text message is encountered.
         Args:
             item: The message item dictionary
         """
         pass
     async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
         """
         Called when an API call is about to start.
         Args:
             kwargs: The kwargs being passed to the API call
         """
         pass
     async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
         """
         Called when an API call has completed.
         Args:
             kwargs: The kwargs that were passed to the API call
             result: The result of the API call
@@ -126,7 +140,7 @@ class AsyncCallbackHandler(ABC):
     async def on_usage(self, usage: Dict[str, Any]) -> None:
         """
         Called when usage information is received.
         Args:
             usage: The usage information
         """
@@ -135,7 +149,7 @@ class AsyncCallbackHandler(ABC):
     async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
         """
         Called when a screenshot is taken.
         Args:
             screenshot: The screenshot image
             name: The name of the screenshot
@@ -145,9 +159,9 @@ class AsyncCallbackHandler(ABC):
     async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
         """
         Called when responses are received.
         Args:
             kwargs: The kwargs being passed to the agent loop
             responses: The responses received
         """
-        pass
+        pass

agent/callbacks/budget_manager.py CHANGED Viewed

@@ -1,17 +1,23 @@
-from typing import Dict, List, Any
+from typing import Any, Dict, List
 from .base import AsyncCallbackHandler
 class BudgetExceededError(Exception):
     """Exception raised when budget is exceeded."""
     pass
 class BudgetManagerCallback(AsyncCallbackHandler):
     """Budget manager callback that tracks usage costs and can stop execution when budget is exceeded."""
-    def __init__(self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False):
+    def __init__(
+        self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False
+    ):
         """
         Initialize BudgetManagerCallback.
         Args:
             max_budget: Maximum budget allowed
             reset_after_each_run: Whether to reset budget after each run
@@ -21,24 +27,30 @@ class BudgetManagerCallback(AsyncCallbackHandler):
         self.reset_after_each_run = reset_after_each_run
         self.raise_error = raise_error
         self.total_cost = 0.0
     async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
         """Reset budget if configured to do so."""
         if self.reset_after_each_run:
             self.total_cost = 0.0
     async def on_usage(self, usage: Dict[str, Any]) -> None:
         """Track usage costs."""
         if "response_cost" in usage:
             self.total_cost += usage["response_cost"]
-    async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
+    async def on_run_continue(
+        self,
+        kwargs: Dict[str, Any],
+        old_items: List[Dict[str, Any]],
+        new_items: List[Dict[str, Any]],
+    ) -> bool:
         """Check if budget allows continuation."""
         if self.total_cost >= self.max_budget:
             if self.raise_error:
-                raise BudgetExceededError(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
+                raise BudgetExceededError(
+                    f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}"
+                )
             else:
                 print(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
             return False
         return True

agent/callbacks/image_retention.py CHANGED Viewed

@@ -2,7 +2,8 @@
 Image retention callback handler that limits the number of recent images in message history.
 """
-from typing import List, Dict, Any, Optional
+from typing import Any, Dict, List, Optional
 from .base import AsyncCallbackHandler
@@ -11,40 +12,40 @@ class ImageRetentionCallback(AsyncCallbackHandler):
     Callback handler that applies image retention policy to limit the number
     of recent images in message history to prevent context window overflow.
     """
     def __init__(self, only_n_most_recent_images: Optional[int] = None):
         """
         Initialize the image retention callback.
         Args:
             only_n_most_recent_images: If set, only keep the N most recent images in message history
         """
         self.only_n_most_recent_images = only_n_most_recent_images
     async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
         Apply image retention policy to messages before sending to agent loop.
         Args:
             messages: List of message dictionaries
         Returns:
             List of messages with image retention policy applied
         """
         if self.only_n_most_recent_images is None:
             return messages
         return self._apply_image_retention(messages)
     def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Apply image retention policy to keep only the N most recent images.
         Removes computer_call_output items with image_url and their corresponding computer_call items,
         keeping only the most recent N image pairs based on only_n_most_recent_images setting.
         Args:
             messages: List of message dictionaries
         Returns:
             Filtered list of messages with image retention applied
         """
@@ -78,7 +79,11 @@ class ImageRetentionCallback(AsyncCallbackHandler):
             # Remove the immediately preceding computer_call with matching call_id (if present)
             call_id = messages[idx].get("call_id")
             prev_idx = idx - 1
-            if prev_idx >= 0 and messages[prev_idx].get("type") == "computer_call" and messages[prev_idx].get("call_id") == call_id:
+            if (
+                prev_idx >= 0
+                and messages[prev_idx].get("type") == "computer_call"
+                and messages[prev_idx].get("call_id") == call_id
+            ):
                 to_remove.add(prev_idx)
                 # Check a single reasoning immediately before that computer_call
                 r_idx = prev_idx - 1
@@ -87,4 +92,4 @@ class ImageRetentionCallback(AsyncCallbackHandler):
         # Construct filtered list
         filtered = [m for i, m in enumerate(messages) if i not in to_remove]
-        return filtered
+        return filtered

agent/callbacks/logging.py CHANGED Viewed

@@ -4,17 +4,18 @@ Logging callback for ComputerAgent that provides configurable logging of agent l
 import json
 import logging
-from typing import Dict, List, Any, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 from .base import AsyncCallbackHandler
 def sanitize_image_urls(data: Any) -> Any:
     """
     Recursively search for 'image_url' keys and set their values to '[omitted]'.
     Args:
         data: Any data structure (dict, list, or primitive type)
     Returns:
         A deep copy of the data with all 'image_url' values replaced with '[omitted]'
     """
@@ -28,11 +29,11 @@ def sanitize_image_urls(data: Any) -> Any:
                 # Recursively sanitize the value
                 sanitized[key] = sanitize_image_urls(value)
         return sanitized
     elif isinstance(data, list):
         # Recursively sanitize each item in the list
         return [sanitize_image_urls(item) for item in data]
     else:
         # For primitive types (str, int, bool, None, etc.), return as-is
         return data
@@ -41,37 +42,36 @@ def sanitize_image_urls(data: Any) -> Any:
 class LoggingCallback(AsyncCallbackHandler):
     """
     Callback handler that logs agent lifecycle events with configurable verbosity.
     Logging levels:
     - DEBUG: All events including API calls, message preprocessing, and detailed outputs
-    - INFO: Major lifecycle events (start/end, messages, outputs)
+    - INFO: Major lifecycle events (start/end, messages, outputs)
     - WARNING: Only warnings and errors
     - ERROR: Only errors
     """
     def __init__(self, logger: Optional[logging.Logger] = None, level: int = logging.INFO):
         """
         Initialize the logging callback.
         Args:
             logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent'
             level: Logging level (logging.DEBUG, logging.INFO, etc.)
         """
-        self.logger = logger or logging.getLogger('agent.ComputerAgent')
+        self.logger = logger or logging.getLogger("agent.ComputerAgent")
         self.level = level
         # Set up logger if it doesn't have handlers
         if not self.logger.handlers:
             handler = logging.StreamHandler()
-            formatter = logging.Formatter(
-                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-            )
+            formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
             handler.setFormatter(formatter)
             self.logger.addHandler(handler)
             self.logger.setLevel(level)
     def _update_usage(self, usage: Dict[str, Any]) -> None:
         """Update total usage statistics."""
         def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
             for key, value in source.items():
                 if isinstance(value, dict):
@@ -82,18 +82,25 @@ class LoggingCallback(AsyncCallbackHandler):
                     if key not in target:
                         target[key] = 0
                     target[key] += value
         add_dicts(self.total_usage, usage)
     async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
         """Called before the run starts."""
         self.total_usage = {}
     async def on_usage(self, usage: Dict[str, Any]) -> None:
         """Called when usage information is received."""
         self._update_usage(usage)
-    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
+    async def on_run_end(
+        self,
+        kwargs: Dict[str, Any],
+        old_items: List[Dict[str, Any]],
+        new_items: List[Dict[str, Any]],
+    ) -> None:
         """Called after the run ends."""
         def format_dict(d, indent=0):
             lines = []
             prefix = f" - {' ' * indent}"
@@ -106,10 +113,10 @@ class LoggingCallback(AsyncCallbackHandler):
                 else:
                     lines.append(f"{prefix}{key}: {value}")
             return lines
         formatted_output = "\n".join(format_dict(self.total_usage))
         self.logger.info(f"Total usage:\n{formatted_output}")
     async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Called before LLM processing starts."""
         if self.logger.isEnabledFor(logging.INFO):
@@ -118,27 +125,27 @@ class LoggingCallback(AsyncCallbackHandler):
             sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
             self.logger.debug(f"LLM input messages: {json.dumps(sanitized_messages, indent=2)}")
         return messages
     async def on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Called after LLM processing ends."""
         if self.logger.isEnabledFor(logging.DEBUG):
             sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
             self.logger.debug(f"LLM output: {json.dumps(sanitized_messages, indent=2)}")
         return messages
     async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
         """Called when a computer call starts."""
         action = item.get("action", {})
         action_type = action.get("type", "unknown")
         action_args = {k: v for k, v in action.items() if k != "type"}
         # INFO level logging for the action
         self.logger.info(f"Computer: {action_type}({action_args})")
         # DEBUG level logging for full details
         if self.logger.isEnabledFor(logging.DEBUG):
             self.logger.debug(f"Computer call started: {json.dumps(action, indent=2)}")
     async def on_computer_call_end(self, item: Dict[str, Any], result: Any) -> None:
         """Called when a computer call ends."""
         if self.logger.isEnabledFor(logging.DEBUG):
@@ -147,48 +154,52 @@ class LoggingCallback(AsyncCallbackHandler):
             if result:
                 sanitized_result = sanitize_image_urls(result)
                 self.logger.debug(f"Computer call result: {json.dumps(sanitized_result, indent=2)}")
     async def on_function_call_start(self, item: Dict[str, Any]) -> None:
         """Called when a function call starts."""
         name = item.get("name", "unknown")
         arguments = item.get("arguments", "{}")
         # INFO level logging for the function call
         self.logger.info(f"Function: {name}({arguments})")
         # DEBUG level logging for full details
         if self.logger.isEnabledFor(logging.DEBUG):
             self.logger.debug(f"Function call started: {name}")
     async def on_function_call_end(self, item: Dict[str, Any], result: Any) -> None:
         """Called when a function call ends."""
         # INFO level logging for function output (similar to function_call_output)
         if result:
             # Handle both list and direct result formats
             if isinstance(result, list) and len(result) > 0:
-                output = result[0].get("output", str(result)) if isinstance(result[0], dict) else str(result[0])
+                output = (
+                    result[0].get("output", str(result))
+                    if isinstance(result[0], dict)
+                    else str(result[0])
+                )
             else:
                 output = str(result)
             # Truncate long outputs
             if len(output) > 100:
                 output = output[:100] + "..."
             self.logger.info(f"Output: {output}")
         # DEBUG level logging for full details
         if self.logger.isEnabledFor(logging.DEBUG):
             name = item.get("name", "unknown")
             self.logger.debug(f"Function call completed: {name}")
             if result:
                 self.logger.debug(f"Function call result: {json.dumps(result, indent=2)}")
     async def on_text(self, item: Dict[str, Any]) -> None:
         """Called when a text message is encountered."""
         # Get the role to determine if it's Agent or User
         role = item.get("role", "unknown")
         content_items = item.get("content", [])
         # Process content items to build display text
         text_parts = []
         for content_item in content_items:
@@ -206,10 +217,10 @@ class LoggingCallback(AsyncCallbackHandler):
             else:
                 # Non-text content, show as [type]
                 text_parts.append(f"[{content_type}]")
         # Join all text parts
-        display_text = ''.join(text_parts) if text_parts else "[empty]"
+        display_text = "".join(text_parts) if text_parts else "[empty]"
         # Log with appropriate level and format
         if role == "assistant":
             self.logger.info(f"Agent: {display_text}")
@@ -219,7 +230,7 @@ class LoggingCallback(AsyncCallbackHandler):
             # Fallback for unknown roles, use debug level
             if self.logger.isEnabledFor(logging.DEBUG):
                 self.logger.debug(f"Text message ({role}): {display_text}")
     async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
         """Called when an API call is about to start."""
         if self.logger.isEnabledFor(logging.DEBUG):
@@ -232,16 +243,18 @@ class LoggingCallback(AsyncCallbackHandler):
             elif "input" in kwargs:
                 sanitized_input = sanitize_image_urls(kwargs["input"])
                 self.logger.debug(f"API call input: {json.dumps(sanitized_input, indent=2)}")
     async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
         """Called when an API call has completed."""
         if self.logger.isEnabledFor(logging.DEBUG):
             model = kwargs.get("model", "unknown")
             self.logger.debug(f"API call completed for model: {model}")
-            self.logger.debug(f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}")
+            self.logger.debug(
+                f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}"
+            )
     async def on_screenshot(self, item: Union[str, bytes], name: str = "screenshot") -> None:
         """Called when a screenshot is taken."""
         if self.logger.isEnabledFor(logging.DEBUG):
             image_size = len(item) / 1024
-            self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB")
+            self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB")

agent/callbacks/operator_validator.py CHANGED Viewed

@@ -9,6 +9,7 @@ Ensures agent output actions conform to expected schemas by fixing common issues
 This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
 The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
 """
 from __future__ import annotations
 from typing import Any, Dict, List
@@ -48,6 +49,7 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
                 action["type"] = "type"
             action_type = action.get("type")
             def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]):
                 """Keep only the provided keys on action; delete everything else.
                 Always ensures required 'type' is present if listed in keys_to_keep.
@@ -55,6 +57,7 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
                 for key in list(action.keys()):
                     if key not in keys_to_keep:
                         del action[key]
             # rename "coordinate" to "x", "y"
             if "coordinate" in action:
                 action["x"] = action["coordinate"][0]
@@ -100,7 +103,6 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
             keep = required_keys_by_type.get(action_type or "")
             if keep:
                 _keep_keys(action, keep)
         # # Second pass: if an assistant message is immediately followed by a computer_call,
         # # replace the assistant message itself with a reasoning message with summary text.

cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl