PyPI - cua-agent - Versions diffs - 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl - Mend

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (79) hide show

agent/__init__.py +4 -10
agent/__main__.py +2 -1
agent/adapters/__init__.py +4 -0
agent/adapters/azure_ml_adapter.py +283 -0
agent/adapters/cua_adapter.py +161 -0
agent/adapters/huggingfacelocal_adapter.py +67 -125
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +110 -99
agent/adapters/models/__init__.py +41 -0
agent/adapters/models/generic.py +78 -0
agent/adapters/models/internvl.py +290 -0
agent/adapters/models/opencua.py +115 -0
agent/adapters/models/qwen2_5_vl.py +78 -0
agent/agent.py +337 -185
agent/callbacks/__init__.py +9 -4
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +54 -98
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +35 -33
agent/callbacks/otel.py +291 -0
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/prompt_instructions.py +47 -0
agent/callbacks/telemetry.py +99 -61
agent/callbacks/trajectory_saver.py +95 -69
agent/cli.py +269 -119
agent/computers/__init__.py +14 -9
agent/computers/base.py +32 -19
agent/computers/cua.py +52 -25
agent/computers/custom.py +78 -71
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +359 -235
agent/integrations/hud/__init__.py +38 -99
agent/integrations/hud/agent.py +369 -0
agent/integrations/hud/proxy.py +166 -52
agent/loops/__init__.py +44 -14
agent/loops/anthropic.py +579 -492
agent/loops/base.py +19 -15
agent/loops/composed_grounded.py +136 -150
agent/loops/fara/__init__.py +8 -0
agent/loops/fara/config.py +506 -0
agent/loops/fara/helpers.py +357 -0
agent/loops/fara/schema.py +143 -0
agent/loops/gelato.py +183 -0
agent/loops/gemini.py +935 -0
agent/loops/generic_vlm.py +601 -0
agent/loops/glm45v.py +140 -135
agent/loops/gta1.py +48 -51
agent/loops/holo.py +218 -0
agent/loops/internvl.py +180 -0
agent/loops/moondream3.py +493 -0
agent/loops/omniparser.py +326 -226
agent/loops/openai.py +50 -51
agent/loops/opencua.py +134 -0
agent/loops/uiins.py +175 -0
agent/loops/uitars.py +247 -206
agent/loops/uitars2.py +951 -0
agent/playground/__init__.py +5 -0
agent/playground/server.py +301 -0
agent/proxy/examples.py +61 -57
agent/proxy/handlers.py +46 -39
agent/responses.py +447 -347
agent/tools/__init__.py +24 -0
agent/tools/base.py +253 -0
agent/tools/browser_tool.py +423 -0
agent/types.py +11 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +25 -22
agent/ui/gradio/ui_components.py +314 -167
cua_agent-0.7.16.dist-info/METADATA +85 -0
cua_agent-0.7.16.dist-info/RECORD +79 -0
{cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
cua_agent-0.4.22.dist-info/METADATA +0 -436
cua_agent-0.4.22.dist-info/RECORD +0 -51
{cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0

agent/computers/base.py CHANGED Viewed

@@ -2,69 +2,82 @@
 Base computer interface protocol for agent interactions.
 """
-from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    Union,
+    runtime_checkable,
+)
 @runtime_checkable
 class AsyncComputerHandler(Protocol):
     """Protocol defining the interface for computer interactions."""
-    # ==== Computer-Use-Preview Action Space ====
+    # ==== Computer-Use-Preview Action Space ====
     async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
         """Get the current environment type."""
         ...
     async def get_dimensions(self) -> tuple[int, int]:
         """Get screen dimensions as (width, height)."""
         ...
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         ...
     async def click(self, x: int, y: int, button: str = "left") -> None:
         """Click at coordinates with specified button."""
         ...
     async def double_click(self, x: int, y: int) -> None:
         """Double click at coordinates."""
         ...
     async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
         """Scroll at coordinates with specified scroll amounts."""
         ...
     async def type(self, text: str) -> None:
         """Type text."""
         ...
     async def wait(self, ms: int = 1000) -> None:
         """Wait for specified milliseconds."""
         ...
     async def move(self, x: int, y: int) -> None:
         """Move cursor to coordinates."""
         ...
     async def keypress(self, keys: Union[List[str], str]) -> None:
         """Press key combination."""
         ...
     async def drag(self, path: List[Dict[str, int]]) -> None:
         """Drag along specified path."""
         ...
     async def get_current_url(self) -> str:
         """Get current URL (for browser environments)."""
         ...
-    # ==== Anthropic Action Space ====
+    # ==== Anthropic Action Space ====
     async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse down at coordinates."""
         ...
     async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse up at coordinates."""
         ...

agent/computers/cua.py CHANGED Viewed

@@ -3,24 +3,27 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
 """
 import base64
-from typing import Dict, List, Any, Literal, Union, Optional
-from .base import AsyncComputerHandler
+from typing import Any, Dict, List, Literal, Optional, Union
 from computer import Computer
+from .base import AsyncComputerHandler
 class cuaComputerHandler(AsyncComputerHandler):
     """Computer handler that implements the Computer protocol using the computer interface."""
     def __init__(self, cua_computer: Computer):
         """Initialize with a computer interface (from tool schema)."""
         self.cua_computer = cua_computer
         self.interface = None
     async def _initialize(self):
-        if hasattr(self.cua_computer, '_initialized') and not self.cua_computer._initialized:
+        if hasattr(self.cua_computer, "_initialized") and not self.cua_computer._initialized:
             await self.cua_computer.run()
         self.interface = self.cua_computer.interface
-    # ==== Computer-Use-Preview Action Space ====
+    # ==== Computer-Use-Preview Action Space ====
     async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
         """Get the current environment type."""
@@ -32,13 +35,17 @@ class cuaComputerHandler(AsyncComputerHandler):
         assert self.interface is not None
         screen_size = await self.interface.get_screen_size()
         return screen_size["width"], screen_size["height"]
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         assert self.interface is not None
         screenshot_bytes = await self.interface.screenshot()
-        return base64.b64encode(screenshot_bytes).decode('utf-8')
+        return base64.b64encode(screenshot_bytes).decode("utf-8")
     async def click(self, x: int, y: int, button: str = "left") -> None:
         """Click at coordinates with specified button."""
         assert self.interface is not None
@@ -49,34 +56,35 @@ class cuaComputerHandler(AsyncComputerHandler):
         else:
             # Default to left click for unknown buttons
             await self.interface.left_click(x, y)
     async def double_click(self, x: int, y: int) -> None:
         """Double click at coordinates."""
         assert self.interface is not None
         await self.interface.double_click(x, y)
     async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
         """Scroll at coordinates with specified scroll amounts."""
         assert self.interface is not None
         await self.interface.move_cursor(x, y)
         await self.interface.scroll(scroll_x, scroll_y)
     async def type(self, text: str) -> None:
         """Type text."""
         assert self.interface is not None
         await self.interface.type_text(text)
     async def wait(self, ms: int = 1000) -> None:
         """Wait for specified milliseconds."""
         assert self.interface is not None
         import asyncio
         await asyncio.sleep(ms / 1000.0)
     async def move(self, x: int, y: int) -> None:
         """Move cursor to coordinates."""
         assert self.interface is not None
         await self.interface.move_cursor(x, y)
     async def keypress(self, keys: Union[List[str], str]) -> None:
         """Press key combination."""
         assert self.interface is not None
@@ -87,38 +95,57 @@ class cuaComputerHandler(AsyncComputerHandler):
         else:
             # Handle key combinations
             await self.interface.hotkey(*keys)
     async def drag(self, path: List[Dict[str, int]]) -> None:
         """Drag along specified path."""
         assert self.interface is not None
         if not path:
             return
         # Start drag from first point
         start = path[0]
         await self.interface.mouse_down(start["x"], start["y"])
         # Move through path
         for point in path[1:]:
             await self.interface.move_cursor(point["x"], point["y"])
         # End drag at last point
         end = path[-1]
         await self.interface.mouse_up(end["x"], end["y"])
     async def get_current_url(self) -> str:
         """Get current URL (for browser environments)."""
         # This would need to be implemented based on the specific browser interface
         # For now, return empty string
         return ""
-    # ==== Anthropic Computer Action Space ====
+    # ==== Anthropic Computer Action Space ====
     async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse down at coordinates."""
         assert self.interface is not None
         await self.interface.mouse_down(x, y, button="left")
     async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse up at coordinates."""
         assert self.interface is not None
-        await self.interface.mouse_up(x, y, button="left")
+        await self.interface.mouse_up(x, y, button="left")
+    # ==== Browser Control Methods (via Playwright) ====
+    async def playwright_exec(
+        self, command: str, params: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """Execute a Playwright browser command.
+        Supports: visit_url, click, type, scroll, web_search, screenshot,
+                  get_current_url, go_back, go_forward
+        Args:
+            command: The browser command to execute
+            params: Command parameters
+        Returns:
+            Dict containing the command result
+        """
+        assert self.interface is not None
+        return await self.interface.playwright_exec(command, params or {})

agent/computers/custom.py CHANGED Viewed

@@ -3,47 +3,49 @@ Custom computer handler implementation that accepts a dictionary of functions.
 """
 import base64
-from typing import Dict, List, Any, Literal, Union, Optional, Callable
-from PIL import Image
 import io
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from PIL import Image
 from .base import AsyncComputerHandler
 class CustomComputerHandler(AsyncComputerHandler):
     """Computer handler that implements the Computer protocol using a dictionary of custom functions."""
     def __init__(self, functions: Dict[str, Callable]):
         """
         Initialize with a dictionary of functions.
         Args:
             functions: Dictionary where keys are method names and values are callable functions.
                       Only 'screenshot' is required, all others are optional.
         Raises:
             ValueError: If required 'screenshot' function is not provided.
         """
-        if 'screenshot' not in functions:
+        if "screenshot" not in functions:
             raise ValueError("'screenshot' function is required in functions dictionary")
         self.functions = functions
         self._last_screenshot_size: Optional[tuple[int, int]] = None
     async def _call_function(self, func, *args, **kwargs):
         """
         Call a function, handling both async and sync functions.
         Args:
             func: The function to call
             *args: Positional arguments to pass to the function
             **kwargs: Keyword arguments to pass to the function
         Returns:
             The result of the function call
         """
         import asyncio
         import inspect
         if callable(func):
             if inspect.iscoroutinefunction(func):
                 return await func(*args, **kwargs)
@@ -51,14 +53,14 @@ class CustomComputerHandler(AsyncComputerHandler):
                 return func(*args, **kwargs)
         else:
             return func
     async def _get_value(self, attribute: str):
         """
         Get value for an attribute, checking both 'get_{attribute}' and '{attribute}' keys.
         Args:
             attribute: The attribute name to look for
         Returns:
             The value from the functions dict, called if callable, returned directly if not
         """
@@ -66,20 +68,20 @@ class CustomComputerHandler(AsyncComputerHandler):
         get_key = f"get_{attribute}"
         if get_key in self.functions:
             return await self._call_function(self.functions[get_key])
-        # Check for '{attribute}'
+        # Check for '{attribute}'
         if attribute in self.functions:
             return await self._call_function(self.functions[attribute])
         return None
     def _to_b64_str(self, img: Union[bytes, Image.Image, str]) -> str:
         """
         Convert image to base64 string.
         Args:
             img: Image as bytes, PIL Image, or base64 string
         Returns:
             str: Base64 encoded image string
         """
@@ -88,43 +90,47 @@ class CustomComputerHandler(AsyncComputerHandler):
             return img
         elif isinstance(img, bytes):
             # Raw bytes
-            return base64.b64encode(img).decode('utf-8')
+            return base64.b64encode(img).decode("utf-8")
         elif isinstance(img, Image.Image):
             # PIL Image
             buffer = io.BytesIO()
-            img.save(buffer, format='PNG')
-            return base64.b64encode(buffer.getvalue()).decode('utf-8')
+            img.save(buffer, format="PNG")
+            return base64.b64encode(buffer.getvalue()).decode("utf-8")
         else:
             raise ValueError(f"Unsupported image type: {type(img)}")
-    # ==== Computer-Use-Preview Action Space ====
+    # ==== Computer-Use-Preview Action Space ====
     async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
         """Get the current environment type."""
-        result = await self._get_value('environment')
+        result = await self._get_value("environment")
         if result is None:
             return "linux"
         assert result in ["windows", "mac", "linux", "browser"]
-        return result # type: ignore
+        return result  # type: ignore
     async def get_dimensions(self) -> tuple[int, int]:
         """Get screen dimensions as (width, height)."""
-        result = await self._get_value('dimensions')
+        result = await self._get_value("dimensions")
         if result is not None:
-            return result # type: ignore
+            return result  # type: ignore
         # Fallback: use last screenshot size if available
         if not self._last_screenshot_size:
             await self.screenshot()
         assert self._last_screenshot_size is not None, "Failed to get screenshot size"
         return self._last_screenshot_size
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
-        result = await self._call_function(self.functions['screenshot'])
-        b64_str = self._to_b64_str(result) # type: ignore
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
+        result = await self._call_function(self.functions["screenshot"])
+        b64_str = self._to_b64_str(result)  # type: ignore
         # Try to extract dimensions for fallback use
         try:
             if isinstance(result, Image.Image):
@@ -136,74 +142,75 @@ class CustomComputerHandler(AsyncComputerHandler):
         except Exception:
             # If we can't get dimensions, that's okay
             pass
         return b64_str
     async def click(self, x: int, y: int, button: str = "left") -> None:
         """Click at coordinates with specified button."""
-        if 'click' in self.functions:
-            await self._call_function(self.functions['click'], x, y, button)
+        if "click" in self.functions:
+            await self._call_function(self.functions["click"], x, y, button)
         # No-op if not implemented
     async def double_click(self, x: int, y: int) -> None:
         """Double click at coordinates."""
-        if 'double_click' in self.functions:
-            await self._call_function(self.functions['double_click'], x, y)
+        if "double_click" in self.functions:
+            await self._call_function(self.functions["double_click"], x, y)
         # No-op if not implemented
     async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
         """Scroll at coordinates with specified scroll amounts."""
-        if 'scroll' in self.functions:
-            await self._call_function(self.functions['scroll'], x, y, scroll_x, scroll_y)
+        if "scroll" in self.functions:
+            await self._call_function(self.functions["scroll"], x, y, scroll_x, scroll_y)
         # No-op if not implemented
     async def type(self, text: str) -> None:
         """Type text."""
-        if 'type' in self.functions:
-            await self._call_function(self.functions['type'], text)
+        if "type" in self.functions:
+            await self._call_function(self.functions["type"], text)
         # No-op if not implemented
     async def wait(self, ms: int = 1000) -> None:
         """Wait for specified milliseconds."""
-        if 'wait' in self.functions:
-            await self._call_function(self.functions['wait'], ms)
+        if "wait" in self.functions:
+            await self._call_function(self.functions["wait"], ms)
         else:
             # Default implementation
             import asyncio
             await asyncio.sleep(ms / 1000.0)
     async def move(self, x: int, y: int) -> None:
         """Move cursor to coordinates."""
-        if 'move' in self.functions:
-            await self._call_function(self.functions['move'], x, y)
+        if "move" in self.functions:
+            await self._call_function(self.functions["move"], x, y)
         # No-op if not implemented
     async def keypress(self, keys: Union[List[str], str]) -> None:
         """Press key combination."""
-        if 'keypress' in self.functions:
-            await self._call_function(self.functions['keypress'], keys)
+        if "keypress" in self.functions:
+            await self._call_function(self.functions["keypress"], keys)
         # No-op if not implemented
     async def drag(self, path: List[Dict[str, int]]) -> None:
         """Drag along specified path."""
-        if 'drag' in self.functions:
-            await self._call_function(self.functions['drag'], path)
+        if "drag" in self.functions:
+            await self._call_function(self.functions["drag"], path)
         # No-op if not implemented
     async def get_current_url(self) -> str:
         """Get current URL (for browser environments)."""
-        if 'get_current_url' in self.functions:
-            return await self._get_value('current_url') # type: ignore
+        if "get_current_url" in self.functions:
+            return await self._get_value("current_url")  # type: ignore
         return ""  # Default fallback
     async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse down at coordinates."""
-        if 'left_mouse_down' in self.functions:
-            await self._call_function(self.functions['left_mouse_down'], x, y)
+        if "left_mouse_down" in self.functions:
+            await self._call_function(self.functions["left_mouse_down"], x, y)
         # No-op if not implemented
     async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
         """Left mouse up at coordinates."""
-        if 'left_mouse_up' in self.functions:
-            await self._call_function(self.functions['left_mouse_up'], x, y)
+        if "left_mouse_up" in self.functions:
+            await self._call_function(self.functions["left_mouse_up"], x, y)
         # No-op if not implemented

agent/decorators.py CHANGED Viewed

@@ -3,47 +3,56 @@ Decorators for agent - agent_loop decorator
 """
 from typing import List, Optional
 from .types import AgentConfigInfo
 # Global registry
 _agent_configs: List[AgentConfigInfo] = []
 def register_agent(models: str, priority: int = 0):
     """
     Decorator to register an AsyncAgentConfig class.
     Args:
         models: Regex pattern to match supported models
         priority: Priority for agent selection (higher = more priority)
     """
     def decorator(agent_class: type):
         # Validate that the class implements AsyncAgentConfig protocol
-        if not hasattr(agent_class, 'predict_step'):
-            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
-        if not hasattr(agent_class, 'predict_click'):
-            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
-        if not hasattr(agent_class, 'get_capabilities'):
-            raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")
+        if not hasattr(agent_class, "predict_step"):
+            raise ValueError(
+                f"Agent class {agent_class.__name__} must implement predict_step method"
+            )
+        if not hasattr(agent_class, "predict_click"):
+            raise ValueError(
+                f"Agent class {agent_class.__name__} must implement predict_click method"
+            )
+        if not hasattr(agent_class, "get_capabilities"):
+            raise ValueError(
+                f"Agent class {agent_class.__name__} must implement get_capabilities method"
+            )
         # Register the agent config
         config_info = AgentConfigInfo(
-            agent_class=agent_class,
-            models_regex=models,
-            priority=priority
+            agent_class=agent_class, models_regex=models, priority=priority
         )
         _agent_configs.append(config_info)
         # Sort by priority (highest first)
         _agent_configs.sort(key=lambda x: x.priority, reverse=True)
         return agent_class
     return decorator
 def get_agent_configs() -> List[AgentConfigInfo]:
     """Get all registered agent configs"""
     return _agent_configs.copy()
 def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
     """Find the best matching agent config for a model"""
     for config_info in _agent_configs:

agent/human_tool/__init__.py CHANGED Viewed

@@ -12,7 +12,7 @@ Components:
 Usage:
     # Run the server and UI
     python -m agent.human_tool
     # Or run components separately
     python -m agent.human_tool.server  # API server only
     python -m agent.human_tool.ui      # UI only
@@ -21,9 +21,4 @@ Usage:
 from .server import CompletionQueue, completion_queue
 from .ui import HumanCompletionUI, create_ui
-__all__ = [
-    "CompletionQueue",
-    "completion_queue",
-    "HumanCompletionUI",
-    "create_ui"
-]
+__all__ = ["CompletionQueue", "completion_queue", "HumanCompletionUI", "create_ui"]

agent/human_tool/__main__.py CHANGED Viewed

@@ -8,6 +8,7 @@ with a Gradio UI for human interaction.
 import gradio as gr
 from fastapi import FastAPI
 from .server import app as fastapi_app
 from .ui import create_ui
@@ -18,6 +19,7 @@ gradio_demo = create_ui()
 CUSTOM_PATH = "/gradio"
 app = gr.mount_gradio_app(fastapi_app, gradio_demo, path=CUSTOM_PATH)
 # Add a redirect from root to Gradio UI
 @fastapi_app.get("/")
 async def redirect_to_ui():
@@ -25,14 +27,16 @@ async def redirect_to_ui():
     return {
         "message": "Human Completion Server is running",
         "ui_url": "/gradio",
-        "api_docs": "/docs"
+        "api_docs": "/docs",
     }
 if __name__ == "__main__":
     import uvicorn
     print("🚀 Starting Human-in-the-Loop Completion Server...")
     print("📊 API Server: http://localhost:8002")
     print("🎨 Gradio UI: http://localhost:8002/gradio")
     print("📚 API Docs: http://localhost:8002/docs")
     uvicorn.run(app, host="0.0.0.0", port=8002)

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl