cua-agent 0.4.12__py3-none-any.whl → 0.4.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of cua-agent has been flagged as potentially problematic; review the changes below before upgrading.
- agent/adapters/__init__.py +2 -0
- agent/adapters/huggingfacelocal_adapter.py +15 -3
- agent/adapters/human_adapter.py +348 -0
- agent/agent.py +29 -21
- agent/callbacks/trajectory_saver.py +35 -26
- agent/cli.py +1 -1
- agent/computers/__init__.py +41 -0
- agent/computers/base.py +70 -0
- agent/{computer_handler.py → computers/cua.py} +26 -23
- agent/computers/custom.py +209 -0
- agent/human_tool/__init__.py +29 -0
- agent/human_tool/__main__.py +38 -0
- agent/human_tool/server.py +234 -0
- agent/human_tool/ui.py +630 -0
- agent/integrations/hud/__init__.py +77 -0
- agent/integrations/hud/adapter.py +121 -0
- agent/integrations/hud/agent.py +373 -0
- agent/integrations/hud/computer_handler.py +187 -0
- agent/types.py +1 -53
- agent/ui/gradio/app.py +1 -0
- agent/ui/gradio/ui_components.py +20 -9
- {cua_agent-0.4.12.dist-info → cua_agent-0.4.13.dist-info}/METADATA +8 -5
- {cua_agent-0.4.12.dist-info → cua_agent-0.4.13.dist-info}/RECORD +25 -13
- {cua_agent-0.4.12.dist-info → cua_agent-0.4.13.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.12.dist-info → cua_agent-0.4.13.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""HUD integration for ComputerAgent."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Optional, Dict
|
|
5
|
+
from hud import run_job as hud_run_job
|
|
6
|
+
|
|
7
|
+
from .agent import ComputerAgent
|
|
8
|
+
from .adapter import ComputerAgentAdapter
|
|
9
|
+
from .computer_handler import HUDComputerHandler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def run_job(
    model: str,
    task_or_taskset: Any,
    job_name: str,
    # Job kwargs
    auto_reply_question: bool = False,
    adapter_cls: Any = None,
    adapter_kwargs: Optional[Dict[str, Any]] = None,
    max_steps_per_task: int = 20,
    run_parallel: bool = True,
    job_metadata: Optional[Dict[str, Any]] = None,
    show_progress: bool = True,
    max_concurrent_env_creations: Optional[int] = 30,  # Limits gym.make calls
    max_concurrent_agent_predictions: Optional[int] = None,  # No limit on LLM calls
    max_concurrent_tasks: Optional[int] = 30,  # Limits overall task concurrency
    **agent_kwargs: Any
) -> Any:
    """
    Run a job using ComputerAgent with the specified model.

    Args:
        model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
        task_or_taskset: Task or TaskSet to run
        job_name: Name for the job
        auto_reply_question: Whether to auto-reply to questions
        adapter_cls: Custom adapter class (defaults to ComputerAgentAdapter)
        adapter_kwargs: Additional kwargs for the adapter
        max_steps_per_task: Maximum steps per task
        run_parallel: Whether to run tasks in parallel
        job_metadata: Additional metadata for the job
        show_progress: Whether to show progress
        max_concurrent_env_creations: Max concurrent environment creations
        max_concurrent_agent_predictions: Max concurrent agent predictions
        max_concurrent_tasks: Max concurrent tasks
        **agent_kwargs: Additional kwargs to pass to ComputerAgent

    Returns:
        Job instance from HUD
    """
    # Fold the legacy boolean "verbose" kwarg into "verbosity".
    # BUG FIX: the previous code forced verbosity=INFO regardless of the
    # flag's value (so verbose=False still enabled INFO), and then derived
    # ``verbose`` with an inverted comparison (verbosity > logging.INFO),
    # which passed verbose=False to HUD exactly when verbose output was
    # requested.
    if "verbose" in agent_kwargs:
        if agent_kwargs.pop("verbose"):
            agent_kwargs["verbosity"] = logging.INFO
    # Lower logging levels mean more output: INFO (or finer) is verbose.
    verbose = agent_kwargs.get("verbosity", logging.WARNING) <= logging.INFO

    # run job
    return await hud_run_job(
        agent_cls=ComputerAgent,
        agent_kwargs={"model": model, **agent_kwargs},
        task_or_taskset=task_or_taskset,
        job_name=job_name,
        auto_reply_question=auto_reply_question,
        adapter_cls=adapter_cls,
        adapter_kwargs=adapter_kwargs,
        max_steps_per_task=max_steps_per_task,
        run_parallel=run_parallel,
        job_metadata=job_metadata,
        show_progress=show_progress,
        verbose=verbose,
        max_concurrent_env_creations=max_concurrent_env_creations,
        max_concurrent_agent_predictions=max_concurrent_agent_predictions,
        max_concurrent_tasks=max_concurrent_tasks
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Public API of the HUD integration package.
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""HUD Adapter for ComputerAgent integration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, ClassVar
|
|
6
|
+
|
|
7
|
+
from hud.adapters.common import CLA, Adapter
|
|
8
|
+
from hud.adapters.common.types import (
|
|
9
|
+
CLAButton,
|
|
10
|
+
CLAKey,
|
|
11
|
+
ClickAction,
|
|
12
|
+
CustomAction,
|
|
13
|
+
DragAction,
|
|
14
|
+
MoveAction,
|
|
15
|
+
Point,
|
|
16
|
+
PressAction,
|
|
17
|
+
ResponseAction,
|
|
18
|
+
ScreenshotFetch,
|
|
19
|
+
ScrollAction,
|
|
20
|
+
TypeAction,
|
|
21
|
+
WaitAction,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ComputerAgentAdapter(Adapter):
    """Translates ComputerAgent action dictionaries into HUD CLA actions."""

    # Key names ComputerAgent emits that differ from their CLA spellings.
    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
        "return": "enter",
        "arrowup": "up",
        "arrowdown": "down",
        "arrowleft": "left",
        "arrowright": "right",
        "cmd": "ctrl",
        "super": "win",
        "meta": "win",
    }

    # Mouse-button aliases; anything not listed passes through unchanged.
    BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
        "wheel": "middle",
        "middle": "middle",
    }

    def __init__(self) -> None:
        super().__init__()
        # ComputerAgent default screen dimensions (can be overridden).
        self.agent_width = 1024
        self.agent_height = 768

    def _map_key(self, key: str) -> CLAKey:
        """Return the standardized CLA spelling of *key* (lowercased fallback)."""
        lowered = key.lower()
        return self.KEY_MAP.get(lowered, lowered)  # type: ignore

    def convert(self, data: Any) -> CLA:
        """Convert a single ComputerAgent action dict into a HUD CLA action.

        Raises:
            ValueError: if the action type is unknown or the payload is malformed.
        """
        try:
            kind = data.get("type")

            if kind == "click":
                target = Point(x=data.get("x", 0), y=data.get("y", 0))
                raw_button = data.get("button", "left")
                button = self.BUTTON_MAP.get(raw_button, raw_button)
                if button is None:
                    button = "left"
                result = ClickAction(point=target, button=button)

            elif kind == "double_click":
                target = Point(x=data.get("x", 0), y=data.get("y", 0))
                # A 100ms pattern encodes a double click for HUD.
                result = ClickAction(point=target, button="left", pattern=[100])

            elif kind == "scroll":
                origin = Point(x=int(data.get("x", 0)), y=int(data.get("y", 0)))
                delta = Point(x=int(data.get("scroll_x", 0)), y=int(data.get("scroll_y", 0)))
                result = ScrollAction(point=origin, scroll=delta)

            elif kind == "type":
                result = TypeAction(text=data.get("text", ""), enter_after=False)

            elif kind == "wait":
                result = WaitAction(time=data.get("ms", 1000))

            elif kind == "move":
                result = MoveAction(point=Point(x=data.get("x", 0), y=data.get("y", 0)))

            elif kind == "keypress":
                keys = data.get("keys", [])
                if isinstance(keys, str):
                    keys = [keys]
                result = PressAction(keys=[self._map_key(k) for k in keys])

            elif kind == "drag":
                waypoints = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in data.get("path", [])]
                result = DragAction(path=waypoints)

            elif kind == "screenshot":
                result = ScreenshotFetch()

            elif kind == "response":
                result = ResponseAction(text=data.get("text", ""))

            elif kind == "custom":
                result = CustomAction(action=data.get("action", ""))

            else:
                raise ValueError(f"Unsupported action type: {kind}")

            # Propagate reasoning and logs when the source action carries them.
            result.reasoning = data.get("reasoning", "")
            result.logs = data.get("logs", "")

            return result

        except Exception as e:
            raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
"""HUD ComputerAgent wrapper for OSWorld benchmarking."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Literal, Optional, Union, List, Dict
|
|
5
|
+
import asyncio
|
|
6
|
+
|
|
7
|
+
from agent import ComputerAgent as BaseComputerAgent
|
|
8
|
+
from agent.responses import make_failed_tool_call_items
|
|
9
|
+
from hud.adapters import Adapter
|
|
10
|
+
from hud.agent.base import Agent
|
|
11
|
+
from hud.utils.common import Observation
|
|
12
|
+
from hud.adapters.common.types import LogType
|
|
13
|
+
from hud.types import Gym
|
|
14
|
+
|
|
15
|
+
from .adapter import ComputerAgentAdapter
|
|
16
|
+
from .computer_handler import HUDComputerHandler
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
BASE_SYSTEM_PROMPT = """
|
|
21
|
+
You are an autonomous computer-using agent. Follow these guidelines:
|
|
22
|
+
|
|
23
|
+
1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
|
|
24
|
+
2. Use the computer tools to complete the task and do not stop until the task is complete.
|
|
25
|
+
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
|
|
26
|
+
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
|
|
27
|
+
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
|
|
28
|
+
6. Trust that the user wants you to complete the entire task they've requested.
|
|
29
|
+
7. You must say "Task completed" when the task is complete.
|
|
30
|
+
|
|
31
|
+
Remember: You have been given permission to complete the requested task autonomously.
|
|
32
|
+
""".strip()
|
|
33
|
+
|
|
34
|
+
class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
    """
    A ComputerAgent wrapper for HUD integration.

    This agent wraps the base ComputerAgent to work with HUD environments,
    providing the same interface as OperatorAgent but using ComputerAgent internally.
    """

    # "qa" gyms are served by the HUD browser environment.
    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}

    def __init__(
        self,
        model: str = "anthropic/claude-3-5-sonnet-20241022",
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        adapter: Optional[Adapter] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Initialize the ComputerAgent for HUD.

        Args:
            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
            environment: The environment type (windows, mac, linux, browser)
            adapter: The adapter to use for preprocessing and postprocessing
            name: The name of the agent
            **kwargs: Additional arguments passed to ComputerAgent
        """
        # Create adapter if not provided
        adapter = adapter or ComputerAgentAdapter()

        if name is None:
            name = f"computeragent-{model.split('/')[-1]}"

        # Initialize the base Agent class without a client; the real client
        # (the wrapped ComputerAgent) is assigned once constructed below.
        super().__init__(client=None, adapter=adapter, name=name)

        self.model = model
        self.environment = environment
        self.kwargs = kwargs

        # Default dimensions; the adapter's agent dimensions take precedence.
        self.width = 1024
        self.height = 768
        if self.adapter:
            self.width = self.adapter.agent_width
            self.height = self.adapter.agent_height

        # Computer handler bridging HUD observations/actions to the agent.
        self.hud_computer = HUDComputerHandler(
            environment=environment,
            dimensions=(self.width, self.height)
        )

        # Handle trajectory_dir by adding TrajectorySaverCallback.
        trajectory_dir = kwargs.pop("trajectory_dir", None)
        callbacks = kwargs.get("callbacks", [])
        if trajectory_dir:
            from agent.callbacks.trajectory_saver import TrajectorySaverCallback
            trajectory_callback = TrajectorySaverCallback(trajectory_dir, reset_on_run=False)
            callbacks = callbacks + [trajectory_callback]
            kwargs["callbacks"] = callbacks

        # Initialize ComputerAgent with the HUD computer handler as its tool.
        self.computer_agent = BaseComputerAgent(
            model=model,
            tools=[self.hud_computer],
            **kwargs
        )

        # Set the client to the computer_agent for compatibility.
        self.client = self.computer_agent

        # State tracking across fetch_response calls.
        self.conversation_history: List[Dict[str, Any]] = []
        self.initial_prompt: Optional[str] = None

        # System prompt prefix for computer use tasks.
        self.base_system_prompt = BASE_SYSTEM_PROMPT

    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
        """
        Fetch a response from ComputerAgent based on the observation.

        Args:
            observation: The preprocessed observation, attributes:
                screenshot: Base64 encoded PNG string of the screen
                text: Text observation, if available

        Returns:
            tuple[list[dict[str, Any]], bool]: The list of raw actions and a
            boolean indicating if the agent believes the task is complete.
        """
        try:
            # Update the computer handler with the current screenshot.
            if observation.screenshot:
                self.hud_computer.update_screenshot(observation.screenshot)

            # Actions emitted by the wrapped agent are collected here via the
            # computer handler's callback.
            captured_actions: List[Dict[str, Any]] = []

            async def action_callback(action: Dict[str, Any]) -> None:
                """Callback to capture actions from ComputerAgent."""
                captured_actions.append(action)

            self.hud_computer.set_action_callback(action_callback)

            # Prepare the next message(s) for ComputerAgent.
            if not self.conversation_history:
                # First interaction - use the observation text as initial prompt.
                if observation.text:
                    self.initial_prompt = observation.text
                    message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
                else:
                    message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."

                input_content = [
                    {"type": "input_text", "text": message}
                ]
                if observation.screenshot:
                    input_content.append(
                        {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{observation.screenshot}",
                        }
                    )
                self.conversation_history.append({"role": "user", "content": input_content})
            else:
                # Subsequent interactions - collect computer_calls that have no
                # matching computer_call_output yet; those must be answered with
                # a screenshot instead of a user message.
                last_computer_calls = []
                for msg in reversed(self.conversation_history):
                    if msg.get("type") == "computer_call":
                        call_id = msg.get("call_id")
                        if call_id:
                            has_output = any(
                                m.get("type") == "computer_call_output" and m.get("call_id") == call_id
                                for m in self.conversation_history
                            )
                            if not has_output:
                                last_computer_calls.append(call_id)

                if last_computer_calls:
                    # BUG FIX: screenshot_b64 was previously assigned only when
                    # the observation had NO screenshot, so any observation that
                    # carried one raised NameError below. Use the observation's
                    # screenshot when present; otherwise take one now.
                    if observation.screenshot:
                        screenshot_b64 = observation.screenshot
                    else:
                        logger.info("No screenshot found, taking screenshot")
                        screenshot_b64 = await self.hud_computer.screenshot()
                    # Add computer_call_output for each unresponded computer_call.
                    for call_id in reversed(last_computer_calls):  # Maintain order
                        self.conversation_history.append({
                            "type": "computer_call_output",
                            "call_id": call_id,
                            "output": {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{screenshot_b64}"
                            }
                        })
                else:
                    # No pending computer_call: add a regular user message.
                    message = "Continue with the task based on the current screen state."
                    input_content = [
                        {"type": "input_text", "text": message}
                    ]
                    if observation.screenshot:
                        input_content.append(
                            {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{observation.screenshot}",
                            }
                        )
                    self.conversation_history.append({"role": "user", "content": input_content})

            # If the last message is a reasoning message, convert it to a
            # normal assistant message with output_text so the history that we
            # hand back to the model stays well-formed.
            if (self.conversation_history and
                    self.conversation_history[-1].get("type") == "reasoning" and
                    self.conversation_history[-1].get("summary")):

                reasoning_msg = self.conversation_history[-1]
                summary_texts = []

                # Extract all summary_text entries.
                for summary_item in reasoning_msg["summary"]:
                    if summary_item.get("type") == "summary_text":
                        summary_texts.append(summary_item.get("text", ""))

                if summary_texts:
                    self.conversation_history[-1] = {
                        "type": "message",
                        "role": "assistant",
                        "content": [
                            {
                                "text": " ".join(summary_texts),
                                "type": "output_text"
                            }
                        ]
                    }

            # Run ComputerAgent.
            try:
                new_items = []

                # ComputerAgent.run returns an async generator.
                try:
                    async for result in self.computer_agent.run(self.conversation_history, stream=False):
                        output = result.get("output", [])
                        # A trailing computer_call_output means an action was
                        # executed: stop and report the captured actions.
                        if output and output[-1].get("type") == "computer_call_output":
                            break
                        new_items += result["output"]
                except Exception as e:
                    # If the last item is a bare reasoning block, convert it to
                    # a normal assistant message.
                    if new_items and new_items[-1].get("type") == "reasoning":
                        new_items[-1] = {
                            "type": "message",
                            "role": "assistant",
                            "content": [
                                {
                                    "text": new_items[-1].get("summary", [{}])[0].get("text", ""),
                                    "type": "output_text"
                                }
                            ]
                        }
                    # Replace any computer_call items with failed-tool-call
                    # records so the model learns the action did not run.
                    computer_calls = [item for item in new_items if item.get("type") == "computer_call"]
                    if computer_calls:
                        new_items = [item for item in new_items if item.get("type") != "computer_call"]
                        for computer_call in computer_calls:
                            tool_input = computer_call.get("action", {})
                            call_id = computer_call.get("call_id")
                            new_items.extend(make_failed_tool_call_items(
                                tool_name="computer",
                                tool_kwargs=tool_input,
                                error_message=repr(e),
                                call_id=call_id
                            ))
                    else:
                        # Fallback for non-computer-call errors: surface the
                        # error to the model as a user message.
                        new_items.append({
                            "type": "user",
                            "content": [
                                {
                                    "type": "input_text",
                                    "text": f"Error during previous attempted action: {repr(e)}"
                                }
                            ]
                        })

                # Check if we captured any actions.
                if captured_actions:
                    # Extract the latest reasoning (or assistant text) to attach
                    # to each captured action.
                    reasoning = ""
                    for msg in reversed(new_items):
                        if msg.get("type") == "reasoning" and msg.get("summary"):
                            reasoning = " ".join([s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"])
                            break
                        elif msg.get("type") == "message" and msg.get("role") == "assistant":
                            content = msg.get("content", [])
                            if isinstance(content, list):
                                reasoning = " ".join([c.get("text", "") for c in content if c.get("type") == "output_text"])
                            break

                    # Update conversation history.
                    self.conversation_history += new_items

                    # Add reasoning and logs to each action.
                    for action in captured_actions:
                        action["reasoning"] = reasoning
                        action["logs"] = {"conversation_length": len(self.conversation_history)}

                    return captured_actions, False

                # No actions: pull the final assistant text out of new_items.
                response_text = ""
                for msg in reversed(new_items):
                    if msg.get("type") == "message" and msg.get("role") == "assistant":
                        content = msg.get("content", [])
                        for c in content:
                            if c.get("type") == "output_text":
                                response_text = c.get("text", response_text)
                                break
                        break

                # The system prompt instructs the model to say "Task completed".
                done = "task completed" in response_text.lower()

                # Update conversation history.
                self.conversation_history += new_items

                response_action = {
                    "type": "response",
                    "text": response_text,
                    "reasoning": response_text,
                    "logs": {"conversation_length": len(self.conversation_history)}
                }

                # An explicitly infeasible task is reported as a FAIL action.
                if "task is infeasible" in response_text.lower():
                    response_action = {"type": "custom", "action": "FAIL"}
                    done = True

                return [response_action], done
            except Exception as e:
                logger.error(f"Error running ComputerAgent: {e}")
                # Return an error response and end the episode.
                error_action = {
                    "type": "response",
                    "text": f"Error occurred: {str(e)}",
                    "reasoning": f"ComputerAgent encountered an error: {str(e)}",
                    "logs": {"error": str(e)}
                }
                return [error_action], True

        except Exception as e:
            logger.error(f"Error in fetch_response: {e}")
            error_action = {
                "type": "response",
                "text": f"Error in agent processing: {str(e)}",
                "reasoning": f"Agent processing error: {str(e)}",
                "logs": {"error": str(e)}
            }
            return [error_action], True
|