cua-agent 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +2 -2
- agent/adapters/huggingfacelocal_adapter.py +8 -5
- agent/agent.py +85 -15
- agent/cli.py +9 -3
- agent/computer_handler.py +3 -1
- agent/decorators.py +28 -66
- agent/loops/__init__.py +3 -1
- agent/loops/anthropic.py +200 -84
- agent/loops/base.py +76 -0
- agent/loops/composed_grounded.py +318 -0
- agent/loops/gta1.py +178 -0
- agent/loops/model_types.csv +6 -0
- agent/loops/omniparser.py +178 -84
- agent/loops/openai.py +198 -58
- agent/loops/uitars.py +305 -178
- agent/responses.py +477 -1
- agent/types.py +7 -5
- agent/ui/gradio/app.py +14 -7
- agent/ui/gradio/ui_components.py +18 -1
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/METADATA +3 -3
- cua_agent-0.4.8.dist-info/RECORD +37 -0
- cua_agent-0.4.6.dist-info/RECORD +0 -33
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/entry_points.txt +0 -0
agent/__init__.py
CHANGED
@@ -5,7 +5,7 @@ agent - Decorator-based Computer Use Agent with liteLLM integration
 import logging
 import sys
 
-from .decorators import
+from .decorators import register_agent
 from .agent import ComputerAgent
 from .types import Messages, AgentResponse
 
@@ -13,7 +13,7 @@ from .types import Messages, AgentResponse
 from . import loops
 
 __all__ = [
-    "
+    "register_agent",
     "ComputerAgent",
    "Messages",
    "AgentResponse"
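The top-level package now re-exports the new register_agent decorator in place of the old loop decorator. A minimal import sketch, assuming the wheel installs the import package as `agent` (as listed in the RECORD):

```python
# Sketch only: the import package name `agent` is taken from the wheel's RECORD entries,
# and these four names are exactly the ones listed in the new __all__.
from agent import register_agent, ComputerAgent, Messages, AgentResponse
```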
agent/adapters/huggingfacelocal_adapter.py
CHANGED
@@ -8,7 +8,7 @@ from litellm import completion, acompletion
 # Try to import HuggingFace dependencies
 try:
     import torch
-    from transformers import
+    from transformers import AutoModelForImageTextToText, AutoProcessor
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
@@ -40,7 +40,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
         """
         if model_name not in self.models:
             # Load model
-            model =
+            model = AutoModelForImageTextToText.from_pretrained(
                 model_name,
                 torch_dtype=torch.float16,
                 device_map=self.device,
@@ -48,7 +48,11 @@ class HuggingFaceLocalAdapter(CustomLLM):
             )
 
             # Load processor
-            processor = AutoProcessor.from_pretrained(
+            processor = AutoProcessor.from_pretrained(
+                model_name,
+                min_pixels=3136,
+                max_pixels=4096 * 2160
+            )
 
             # Cache them
             self.models[model_name] = model
@@ -141,8 +145,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
         )
 
         # Move inputs to the same device as model
-
-        inputs = inputs.to("cuda")
+        inputs = inputs.to(model.device)
 
         # Generate response
         with torch.no_grad():
agent/agent.py
CHANGED
@@ -3,12 +3,12 @@ ComputerAgent - Main agent class that selects and runs agent loops
 """
 
 import asyncio
-from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set
+from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple
 
 from litellm.responses.utils import Usage
 
-from .types import Messages, Computer
-from .decorators import
+from .types import Messages, Computer, AgentCapability
+from .decorators import find_agent_config
 from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url
 import json
 import litellm
@@ -117,6 +117,13 @@ def sanitize_message(msg: Any) -> Any:
         return sanitized
     return msg
 
+def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
+    call_ids = []
+    for message in messages:
+        if message.get("type") == "computer_call_output" or message.get("type") == "function_call_output":
+            call_ids.append(message.get("call_id"))
+    return call_ids
+
 class ComputerAgent:
     """
     Main agent class that automatically selects the appropriate agent loop
@@ -207,19 +214,21 @@ class ComputerAgent:
             litellm.custom_provider_map = [
                 {"provider": "huggingface-local", "custom_handler": hf_adapter}
             ]
+            litellm.suppress_debug_info = True
 
         # == Initialize computer agent ==
 
         # Find the appropriate agent loop
         if custom_loop:
             self.agent_loop = custom_loop
-            self.
+            self.agent_config_info = None
         else:
-
-            if not
-                raise ValueError(f"No agent
-
-            self.
+            config_info = find_agent_config(model)
+            if not config_info:
+                raise ValueError(f"No agent config found for model: {model}")
+            # Instantiate the agent config class
+            self.agent_loop = config_info.agent_class()
+            self.agent_config_info = config_info
 
         self.tool_schemas = []
         self.computer_handler = None
@@ -389,8 +398,10 @@ class ComputerAgent:
     # AGENT OUTPUT PROCESSING
     # ============================================================================
 
-    async def _handle_item(self, item: Any, computer: Optional[Computer] = None) -> List[Dict[str, Any]]:
+    async def _handle_item(self, item: Any, computer: Optional[Computer] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
         """Handle each item; may cause a computer action + screenshot."""
+        if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids:
+            return []
 
         item_type = item.get("type", None)
 
@@ -411,6 +422,9 @@ class ComputerAgent:
             # Perform computer actions
             action = item.get("action")
             action_type = action.get("type")
+            if action_type is None:
+                print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
+                return []
 
             # Extract action arguments (all fields except 'type')
             action_args = {k: v for k, v in action.items() if k != "type"}
@@ -436,7 +450,7 @@ class ComputerAgent:
                 acknowledged_checks = []
                 for check in pending_checks:
                     check_message = check.get("message", str(check))
-                    if acknowledge_safety_check_callback(check_message):
+                    if acknowledge_safety_check_callback(check_message, allow_always=True): # TODO: implement a callback for safety checks
                         acknowledged_checks.append(check)
                     else:
                         raise ValueError(f"Safety check failed: {check_message}")
@@ -511,6 +525,12 @@ class ComputerAgent:
         Returns:
             AsyncGenerator that yields response chunks
         """
+        if not self.agent_config_info:
+            raise ValueError("Agent configuration not found")
+
+        capabilities = self.get_capabilities()
+        if "step" not in capabilities:
+            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")
 
         await self._initialize_computers()
 
@@ -525,7 +545,7 @@ class ComputerAgent:
             "messages": messages,
             "stream": stream,
             "model": self.model,
-            "agent_loop": self.
+            "agent_loop": self.agent_config_info.agent_class.__name__,
             **merged_kwargs
         }
         await self._on_run_start(run_kwargs, old_items)
@@ -555,7 +575,7 @@ class ComputerAgent:
             }
 
             # Run agent loop iteration
-            result = await self.agent_loop(
+            result = await self.agent_loop.predict_step(
                 **loop_kwargs,
                 _on_api_start=self._on_api_start,
                 _on_api_end=self._on_api_end,
@@ -576,9 +596,12 @@ class ComputerAgent:
             # Add agent response to new_items
             new_items += result.get("output")
 
+            # Get output call ids
+            output_call_ids = get_output_call_ids(result.get("output", []))
+
             # Handle computer actions
             for item in result.get("output"):
-                partial_items = await self._handle_item(item, self.computer_handler)
+                partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
                 new_items += partial_items
 
             # Yield partial response
@@ -591,4 +614,51 @@ class ComputerAgent:
             )
         }
 
-        await self._on_run_end(loop_kwargs, old_items, new_items)
+        await self._on_run_end(loop_kwargs, old_items, new_items)
+
+    async def predict_click(
+        self,
+        instruction: str,
+        image_b64: Optional[str] = None
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates based on image and instruction.
+
+        Args:
+            instruction: Instruction for where to click
+            image_b64: Base64 encoded image (optional, will take screenshot if not provided)
+
+        Returns:
+            None or tuple with (x, y) coordinates
+        """
+        if not self.agent_config_info:
+            raise ValueError("Agent configuration not found")
+
+        capabilities = self.get_capabilities()
+        if "click" not in capabilities:
+            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions")
+        if hasattr(self.agent_loop, 'predict_click'):
+            if not image_b64:
+                if not self.computer_handler:
+                    raise ValueError("Computer tool or image_b64 is required for predict_click")
+                image_b64 = await self.computer_handler.screenshot()
+            return await self.agent_loop.predict_click(
+                model=self.model,
+                image_b64=image_b64,
+                instruction=instruction
+            )
+        return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """
+        Get list of capabilities supported by the current agent config.
+
+        Returns:
+            List of capability strings (e.g., ["step", "click"])
+        """
+        if not self.agent_config_info:
+            raise ValueError("Agent configuration not found")
+
+        if hasattr(self.agent_loop, 'get_capabilities'):
+            return self.agent_loop.get_capabilities()
+        return ["step"] # Default capability
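With these changes, ComputerAgent exposes capability-gated single-shot click grounding next to the existing run loop. A hedged usage sketch (the model id and screenshot path are placeholders; other constructor kwargs such as tools and max_retries are omitted):

```python
# Sketch of the new ComputerAgent surface added in 0.4.8; placeholder values are marked.
import asyncio
import base64
from agent import ComputerAgent

async def main():
    agent = ComputerAgent(model="openai/computer-use-preview")  # placeholder model id

    # New: ask which prediction modes the selected agent config supports.
    print(agent.get_capabilities())  # e.g. ["step"] or ["step", "click"]

    # New: single-shot click grounding. With an explicit screenshot no computer
    # tool is needed; otherwise the agent takes one via its computer handler.
    with open("screenshot.png", "rb") as f:  # placeholder screenshot file
        image_b64 = base64.b64encode(f.read()).decode()
    coords = await agent.predict_click(
        instruction="Click the Submit button",
        image_b64=image_b64,
    )
    print(coords)  # None or an (x, y) tuple

asyncio.run(main())
```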
agent/cli.py
CHANGED
@@ -120,7 +120,7 @@ async def ainput(prompt: str = ""):
 
 async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
     """Main chat loop with the agent."""
-    print_welcome(model, agent.
+    print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
 
     history = []
 
@@ -130,7 +130,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
     total_cost = 0
 
     while True:
-        if history[-1].get("role") != "user":
+        if len(history) == 0 or history[-1].get("role") != "user":
             # Get user input with prompt
             print_colored("> ", end="")
             user_input = await ainput()
@@ -260,7 +260,12 @@ Examples:
         help="Show total cost of the agent runs"
     )
 
-
+    parser.add_argument(
+        "-r", "--max-retries",
+        type=int,
+        default=3,
+        help="Maximum number of retries for the LLM API calls"
+    )
 
     args = parser.parse_args()
 
@@ -327,6 +332,7 @@ Examples:
         "model": args.model,
         "tools": [computer],
         "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
+        "max_retries": args.max_retries
     }
 
     if args.images > 0:
agent/computer_handler.py
CHANGED
@@ -93,8 +93,10 @@ class OpenAIComputerHandler:
         return ""
 
 
-def acknowledge_safety_check_callback(message: str) -> bool:
+def acknowledge_safety_check_callback(message: str, allow_always: bool = False) -> bool:
     """Safety check callback for user acknowledgment."""
+    if allow_always:
+        return True
     response = input(
         f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
     ).lower()
agent/decorators.py
CHANGED
@@ -2,89 +2,51 @@
 Decorators for agent - agent_loop decorator
 """
 
-import
-import
-from typing import Dict, List, Any, Callable, Optional
-from functools import wraps
-
-from .types import AgentLoopInfo
+from typing import List, Optional
+from .types import AgentConfigInfo
 
 # Global registry
-
+_agent_configs: List[AgentConfigInfo] = []
 
-def
+def register_agent(models: str, priority: int = 0):
     """
-    Decorator to register an
+    Decorator to register an AsyncAgentConfig class.
 
     Args:
         models: Regex pattern to match supported models
-        priority: Priority for
+        priority: Priority for agent selection (higher = more priority)
     """
-    def decorator(
-        # Validate
-
-
-
-
-        if not
-
-            raise ValueError(f"Agent loop function must have parameters: {missing}")
+    def decorator(agent_class: type):
+        # Validate that the class implements AsyncAgentConfig protocol
+        if not hasattr(agent_class, 'predict_step'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
+        if not hasattr(agent_class, 'predict_click'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
+        if not hasattr(agent_class, 'get_capabilities'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")
 
-        # Register the
-
-
+        # Register the agent config
+        config_info = AgentConfigInfo(
+            agent_class=agent_class,
             models_regex=models,
             priority=priority
        )
-
+        _agent_configs.append(config_info)
 
         # Sort by priority (highest first)
-
-
-        @wraps(func)
-        async def wrapper(*args, **kwargs):
-            # Wrap the function in an asyncio.Queue for cancellation support
-            queue = asyncio.Queue()
-            task = None
-
-            try:
-                # Create a task that can be cancelled
-                async def run_loop():
-                    try:
-                        result = await func(*args, **kwargs)
-                        await queue.put(('result', result))
-                    except Exception as e:
-                        await queue.put(('error', e))
-
-                task = asyncio.create_task(run_loop())
-
-                # Wait for result or cancellation
-                event_type, data = await queue.get()
-
-                if event_type == 'error':
-                    raise data
-                return data
-
-            except asyncio.CancelledError:
-                if task:
-                    task.cancel()
-                    try:
-                        await task
-                    except asyncio.CancelledError:
-                        pass
-                raise
+        _agent_configs.sort(key=lambda x: x.priority, reverse=True)
 
-        return
+        return agent_class
 
     return decorator
 
-def
-    """Get all registered agent
-    return
+def get_agent_configs() -> List[AgentConfigInfo]:
+    """Get all registered agent configs"""
+    return _agent_configs.copy()
 
-def
-    """Find the best matching agent
-    for
-        if
-            return
+def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
+    """Find the best matching agent config for a model"""
+    for config_info in _agent_configs:
+        if config_info.matches_model(model):
+            return config_info
     return None
agent/loops/__init__.py
CHANGED
@@ -7,5 +7,7 @@ from . import anthropic
 from . import openai
 from . import uitars
 from . import omniparser
+from . import gta1
+from . import composed_grounded
 
-__all__ = ["anthropic", "openai", "uitars", "omniparser"]
+__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded"]