PyPI - cua-agent - Versions diffs - 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl - Mend

cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show

agent/__init__.py +15 -51
agent/__main__.py +21 -0
agent/adapters/__init__.py +9 -0
agent/adapters/huggingfacelocal_adapter.py +216 -0
agent/agent.py +577 -0
agent/callbacks/__init__.py +17 -0
agent/callbacks/base.py +153 -0
agent/callbacks/budget_manager.py +44 -0
agent/callbacks/image_retention.py +139 -0
agent/callbacks/logging.py +247 -0
agent/callbacks/pii_anonymization.py +259 -0
agent/callbacks/trajectory_saver.py +305 -0
agent/cli.py +290 -0
agent/computer_handler.py +107 -0
agent/decorators.py +90 -0
agent/loops/__init__.py +11 -0
agent/loops/anthropic.py +728 -0
agent/loops/omniparser.py +339 -0
agent/loops/openai.py +95 -0
agent/loops/uitars.py +688 -0
agent/responses.py +207 -0
agent/types.py +79 -0
agent/ui/__init__.py +7 -1
agent/ui/gradio/__init__.py +6 -19
agent/ui/gradio/app.py +80 -1299
agent/ui/gradio/ui_components.py +703 -0
cua_agent-0.4.0b1.dist-info/METADATA +424 -0
cua_agent-0.4.0b1.dist-info/RECORD +30 -0
agent/core/__init__.py +0 -27
agent/core/agent.py +0 -210
agent/core/base.py +0 -217
agent/core/callbacks.py +0 -200
agent/core/experiment.py +0 -249
agent/core/factory.py +0 -122
agent/core/messages.py +0 -332
agent/core/provider_config.py +0 -21
agent/core/telemetry.py +0 -142
agent/core/tools/__init__.py +0 -21
agent/core/tools/base.py +0 -74
agent/core/tools/bash.py +0 -52
agent/core/tools/collection.py +0 -46
agent/core/tools/computer.py +0 -113
agent/core/tools/edit.py +0 -67
agent/core/tools/manager.py +0 -56
agent/core/tools.py +0 -32
agent/core/types.py +0 -88
agent/core/visualization.py +0 -197
agent/providers/__init__.py +0 -4
agent/providers/anthropic/__init__.py +0 -6
agent/providers/anthropic/api/client.py +0 -360
agent/providers/anthropic/api/logging.py +0 -150
agent/providers/anthropic/api_handler.py +0 -140
agent/providers/anthropic/callbacks/__init__.py +0 -5
agent/providers/anthropic/callbacks/manager.py +0 -65
agent/providers/anthropic/loop.py +0 -568
agent/providers/anthropic/prompts.py +0 -23
agent/providers/anthropic/response_handler.py +0 -226
agent/providers/anthropic/tools/__init__.py +0 -33
agent/providers/anthropic/tools/base.py +0 -88
agent/providers/anthropic/tools/bash.py +0 -66
agent/providers/anthropic/tools/collection.py +0 -34
agent/providers/anthropic/tools/computer.py +0 -396
agent/providers/anthropic/tools/edit.py +0 -326
agent/providers/anthropic/tools/manager.py +0 -54
agent/providers/anthropic/tools/run.py +0 -42
agent/providers/anthropic/types.py +0 -16
agent/providers/anthropic/utils.py +0 -381
agent/providers/omni/__init__.py +0 -8
agent/providers/omni/api_handler.py +0 -42
agent/providers/omni/clients/anthropic.py +0 -103
agent/providers/omni/clients/base.py +0 -35
agent/providers/omni/clients/oaicompat.py +0 -195
agent/providers/omni/clients/ollama.py +0 -122
agent/providers/omni/clients/openai.py +0 -155
agent/providers/omni/clients/utils.py +0 -25
agent/providers/omni/image_utils.py +0 -34
agent/providers/omni/loop.py +0 -990
agent/providers/omni/parser.py +0 -307
agent/providers/omni/prompts.py +0 -64
agent/providers/omni/tools/__init__.py +0 -30
agent/providers/omni/tools/base.py +0 -29
agent/providers/omni/tools/bash.py +0 -74
agent/providers/omni/tools/computer.py +0 -179
agent/providers/omni/tools/manager.py +0 -61
agent/providers/omni/utils.py +0 -236
agent/providers/openai/__init__.py +0 -6
agent/providers/openai/api_handler.py +0 -456
agent/providers/openai/loop.py +0 -472
agent/providers/openai/response_handler.py +0 -205
agent/providers/openai/tools/__init__.py +0 -15
agent/providers/openai/tools/base.py +0 -79
agent/providers/openai/tools/computer.py +0 -326
agent/providers/openai/tools/manager.py +0 -106
agent/providers/openai/types.py +0 -36
agent/providers/openai/utils.py +0 -98
agent/providers/uitars/__init__.py +0 -1
agent/providers/uitars/clients/base.py +0 -35
agent/providers/uitars/clients/mlxvlm.py +0 -263
agent/providers/uitars/clients/oaicompat.py +0 -214
agent/providers/uitars/loop.py +0 -660
agent/providers/uitars/prompts.py +0 -63
agent/providers/uitars/tools/__init__.py +0 -1
agent/providers/uitars/tools/computer.py +0 -283
agent/providers/uitars/tools/manager.py +0 -60
agent/providers/uitars/utils.py +0 -264
agent/telemetry.py +0 -21
agent/ui/__main__.py +0 -15
cua_agent-0.3.2.dist-info/METADATA +0 -295
cua_agent-0.3.2.dist-info/RECORD +0 -87
{cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
{cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0

agent/cli.py ADDED Viewed

@@ -0,0 +1,290 @@
+"""
+CLI chat interface for agent - Computer Use Agent
+Usage:
+    python -m agent.cli <model_string>
+Examples:
+    python -m agent.cli openai/computer-use-preview
+    python -m agent.cli anthropic/claude-3-5-sonnet-20241022
+    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+"""
+import asyncio
+import argparse
+import os
+import sys
+import json
+from typing import List, Dict, Any
+import dotenv
+from yaspin import yaspin
+# Load environment variables
+dotenv.load_dotenv()
+# Color codes for terminal output
+class Colors:
+    """ANSI escape sequences used to style terminal output."""
+    # Attribute controls; RESET is appended by print_colored after every message.
+    RESET = '\033[0m'
+    BOLD = '\033[1m'
+    DIM = '\033[2m'
+    # Text colors
+    RED = '\033[31m'
+    GREEN = '\033[32m'
+    YELLOW = '\033[33m'
+    BLUE = '\033[34m'
+    MAGENTA = '\033[35m'
+    CYAN = '\033[36m'
+    WHITE = '\033[37m'
+    GRAY = '\033[90m'
+    # Background colors
+    BG_RED = '\033[41m'
+    BG_GREEN = '\033[42m'
+    BG_YELLOW = '\033[43m'
+    BG_BLUE = '\033[44m'
+def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
+    """Print colored text to terminal."""
+    prefix = ""
+    if bold:
+        prefix += Colors.BOLD
+    if dim:
+        prefix += Colors.DIM
+    if color:
+        prefix += color
+    print(f"{prefix}{text}{Colors.RESET}", end=end)
+def print_action(action_type: str, details: Dict[str, Any]):
+    """Print computer action with nice formatting."""
+    # Format action details
+    args_str = ""
+    if action_type == "click" and "x" in details and "y" in details:
+        args_str = f"({details['x']}, {details['y']})"
+    elif action_type == "type" and "text" in details:
+        text = details["text"]
+        if len(text) > 50:
+            text = text[:47] + "..."
+        args_str = f'"{text}"'
+    elif action_type == "key" and "key" in details:
+        args_str = f"'{details['key']}'"
+    elif action_type == "scroll" and "x" in details and "y" in details:
+        args_str = f"({details['x']}, {details['y']})"
+    print_colored(f"🛠️  {action_type}{args_str}", dim=True)
+def print_welcome(model: str, agent_loop: str, container_name: str):
+    """Print welcome message."""
+    print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
+    print_colored("Type 'exit' to quit.", dim=True)
+async def ainput(prompt: str = ""):
+    return await asyncio.to_thread(input, prompt)
+async def chat_loop(agent, model: str, container_name: str):
+    """Main chat loop with the agent."""
+    print_welcome(model, agent.agent_loop.__name__, container_name)
+    history = []
+    while True:
+        # Get user input with prompt
+        print_colored("> ", end="")
+        user_input = await ainput()
+        if user_input.lower() in ['exit', 'quit', 'q']:
+            print_colored("\n👋 Goodbye!")
+            break
+        if not user_input:
+            continue
+        # Add user message to history
+        history.append({"role": "user", "content": user_input})
+        # Stream responses from the agent with spinner
+        with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
+            spinner.hide()
+            async for result in agent.run(history):
+                # Add agent responses to history
+                history.extend(result.get("output", []))
+                # Process and display the output
+                for item in result.get("output", []):
+                    if item.get("type") == "message":
+                        # Display agent text response
+                        content = item.get("content", [])
+                        for content_part in content:
+                            if content_part.get("text"):
+                                text = content_part.get("text", "").strip()
+                                if text:
+                                    spinner.hide()
+                                    print_colored(text)
+                    elif item.get("type") == "computer_call":
+                        # Display computer action
+                        action = item.get("action", {})
+                        action_type = action.get("type", "")
+                        if action_type:
+                            spinner.hide()
+                            print_action(action_type, action)
+                            spinner.text = f"Performing {action_type}..."
+                            spinner.show()
+                    elif item.get("type") == "function_call":
+                        # Display function call
+                        function_name = item.get("name", "")
+                        spinner.hide()
+                        print_colored(f"🔧 Calling function: {function_name}", dim=True)
+                        spinner.text = f"Calling {function_name}..."
+                        spinner.show()
+                    elif item.get("type") == "function_call_output":
+                        # Display function output (dimmed)
+                        output = item.get("output", "")
+                        if output and len(output.strip()) > 0:
+                            spinner.hide()
+                            print_colored(f"📤 {output}", dim=True)
+            spinner.hide()
+async def main():
+    """Main CLI function."""
+    parser = argparse.ArgumentParser(
+        description="CUA Agent CLI - Interactive computer use assistant",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python -m agent.cli openai/computer-use-preview
+  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
+  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+        """
+    )
+    parser.add_argument(
+        "model",
+        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
+    )
+    parser.add_argument(
+        "--images",
+        type=int,
+        default=3,
+        help="Number of recent images to keep in context (default: 3)"
+    )
+    parser.add_argument(
+        "--trajectory",
+        action="store_true",
+        help="Save trajectory for debugging"
+    )
+    parser.add_argument(
+        "--budget",
+        type=float,
+        help="Maximum budget for the session (in dollars)"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging"
+    )
+    args = parser.parse_args()
+    # Check for required environment variables
+    container_name = os.getenv("CUA_CONTAINER_NAME")
+    cua_api_key = os.getenv("CUA_API_KEY")
+    # Prompt for missing environment variables
+    if not container_name:
+        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
+        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
+        container_name = input("Enter your CUA container name: ").strip()
+        if not container_name:
+            print_colored("❌ Container name is required.")
+            sys.exit(1)
+    if not cua_api_key:
+        print_colored("CUA_API_KEY not set.", dim=True)
+        cua_api_key = input("Enter your CUA API key: ").strip()
+        if not cua_api_key:
+            print_colored("❌ API key is required.")
+            sys.exit(1)
+    # Check for provider-specific API keys based on model
+    provider_api_keys = {
+        "openai/": "OPENAI_API_KEY",
+        "anthropic/": "ANTHROPIC_API_KEY",
+        "omniparser+": "OPENAI_API_KEY",
+        "omniparser+": "ANTHROPIC_API_KEY",
+    }
+    # Find matching provider and check for API key
+    for prefix, env_var in provider_api_keys.items():
+        if args.model.startswith(prefix):
+            if not os.getenv(env_var):
+                print_colored(f"{env_var} not set.", dim=True)
+                api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
+                if not api_key:
+                    print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.")
+                    sys.exit(1)
+                # Set the environment variable for the session
+                os.environ[env_var] = api_key
+            break
+    # Import here to avoid import errors if dependencies are missing
+    try:
+        from agent import ComputerAgent
+        from computer import Computer
+    except ImportError as e:
+        print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
+        print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
+        sys.exit(1)
+    # Create computer instance
+    async with Computer(
+        os_type="linux",
+        provider_type="cloud",
+        name=container_name,
+        api_key=cua_api_key
+    ) as computer:
+        # Create agent
+        agent_kwargs = {
+            "model": args.model,
+            "tools": [computer],
+            "only_n_most_recent_images": args.images,
+            "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
+        }
+        if args.trajectory:
+            agent_kwargs["trajectory_dir"] = "trajectories"
+        if args.budget:
+            agent_kwargs["max_trajectory_budget"] = {
+                "max_budget": args.budget,
+                "raise_error": True,
+                "reset_after_each_run": False
+            }
+        agent = ComputerAgent(**agent_kwargs)
+        # Start chat loop
+        await chat_loop(agent, args.model, container_name)
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except (KeyboardInterrupt, EOFError) as _:
+        print_colored("\n\n👋 Goodbye!")

agent/computer_handler.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""
+Computer handler implementation for OpenAI computer-use-preview protocol.
+"""
+import base64
+from typing import Dict, List, Any, Literal
+from .types import Computer
+class OpenAIComputerHandler:
+    """Computer handler that implements the Computer protocol using the computer interface."""
+    def __init__(self, computer_interface):
+        """Initialize with a computer interface (from tool schema)."""
+        self.interface = computer_interface
+    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
+        """Get the current environment type."""
+        # For now, return a default - this could be enhanced to detect actual environment
+        return "windows"
+    async def get_dimensions(self) -> tuple[int, int]:
+        """Get screen dimensions as (width, height)."""
+        screen_size = await self.interface.get_screen_size()
+        return screen_size["width"], screen_size["height"]
+    async def screenshot(self) -> str:
+        """Take a screenshot and return as base64 string."""
+        screenshot_bytes = await self.interface.screenshot()
+        return base64.b64encode(screenshot_bytes).decode('utf-8')
+    async def click(self, x: int, y: int, button: str = "left") -> None:
+        """Click at coordinates with specified button."""
+        if button == "left":
+            await self.interface.left_click(x, y)
+        elif button == "right":
+            await self.interface.right_click(x, y)
+        else:
+            # Default to left click for unknown buttons
+            await self.interface.left_click(x, y)
+    async def double_click(self, x: int, y: int) -> None:
+        """Double click at coordinates."""
+        await self.interface.double_click(x, y)
+    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
+        """Scroll at coordinates with specified scroll amounts."""
+        await self.interface.move_cursor(x, y)
+        await self.interface.scroll(scroll_x, scroll_y)
+    async def type(self, text: str) -> None:
+        """Type text."""
+        await self.interface.type_text(text)
+    async def wait(self, ms: int = 1000) -> None:
+        """Wait for specified milliseconds."""
+        import asyncio
+        await asyncio.sleep(ms / 1000.0)
+    async def move(self, x: int, y: int) -> None:
+        """Move cursor to coordinates."""
+        await self.interface.move_cursor(x, y)
+    async def keypress(self, keys: List[str]) -> None:
+        """Press key combination."""
+        if len(keys) == 1:
+            await self.interface.press_key(keys[0])
+        else:
+            # Handle key combinations
+            await self.interface.hotkey(*keys)
+    async def drag(self, path: List[Dict[str, int]]) -> None:
+        """Drag along specified path."""
+        if not path:
+            return
+        # Start drag from first point
+        start = path[0]
+        await self.interface.mouse_down(start["x"], start["y"])
+        # Move through path
+        for point in path[1:]:
+            await self.interface.move_cursor(point["x"], point["y"])
+        # End drag at last point
+        end = path[-1]
+        await self.interface.mouse_up(end["x"], end["y"])
+    async def get_current_url(self) -> str:
+        """Get current URL (for browser environments)."""
+        # This would need to be implemented based on the specific browser interface
+        # For now, return empty string
+        return ""
+def acknowledge_safety_check_callback(message: str) -> bool:
+    """Safety check callback for user acknowledgment."""
+    response = input(
+        f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
+    ).lower()
+    return response.strip() == "y"
+def check_blocklisted_url(url: str) -> None:
+    """Check if URL is blocklisted (placeholder implementation)."""
+    # This would contain actual URL checking logic
+    pass

agent/decorators.py ADDED Viewed

@@ -0,0 +1,90 @@
+"""
+Decorators for agent - agent_loop decorator
+"""
+import asyncio
+import inspect
+from typing import Dict, List, Any, Callable, Optional
+from functools import wraps
+from .types import AgentLoopInfo
+# Global registry
+_agent_loops: List[AgentLoopInfo] = []
+def agent_loop(models: str, priority: int = 0):
+    """
+    Decorator to register an agent loop function.
+    Args:
+        models: Regex pattern to match supported models
+        priority: Priority for loop selection (higher = more priority)
+    """
+    def decorator(func: Callable):
+        # Validate function signature
+        sig = inspect.signature(func)
+        required_params = {'messages', 'model'}
+        func_params = set(sig.parameters.keys())
+        if not required_params.issubset(func_params):
+            missing = required_params - func_params
+            raise ValueError(f"Agent loop function must have parameters: {missing}")
+        # Register the loop
+        loop_info = AgentLoopInfo(
+            func=func,
+            models_regex=models,
+            priority=priority
+        )
+        _agent_loops.append(loop_info)
+        # Sort by priority (highest first)
+        _agent_loops.sort(key=lambda x: x.priority, reverse=True)
+        @wraps(func)
+        async def wrapper(*args, **kwargs):
+            # Wrap the function in an asyncio.Queue for cancellation support
+            queue = asyncio.Queue()
+            task = None
+            try:
+                # Create a task that can be cancelled
+                async def run_loop():
+                    try:
+                        result = await func(*args, **kwargs)
+                        await queue.put(('result', result))
+                    except Exception as e:
+                        await queue.put(('error', e))
+                task = asyncio.create_task(run_loop())
+                # Wait for result or cancellation
+                event_type, data = await queue.get()
+                if event_type == 'error':
+                    raise data
+                return data
+            except asyncio.CancelledError:
+                if task:
+                    task.cancel()
+                    try:
+                        await task
+                    except asyncio.CancelledError:
+                        pass
+                raise
+        return wrapper
+    return decorator
+def get_agent_loops() -> List[AgentLoopInfo]:
+    """Get all registered agent loops"""
+    return _agent_loops.copy()
+def find_agent_loop(model: str) -> Optional[AgentLoopInfo]:
+    """Find the best matching agent loop for a model"""
+    for loop_info in _agent_loops:
+        if loop_info.matches_model(model):
+            return loop_info
+    return None

agent/loops/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""
+Agent loops for agent
+"""
+# Import the loops to register them
+from . import anthropic
+from . import openai
+from . import uitars
+from . import omniparser
+__all__ = ["anthropic", "openai", "uitars", "omniparser"]

cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

Potentially problematic release.

cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl