cua-agent 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

agent/loops/omniparser.py CHANGED
@@ -1,5 +1,7 @@
  """
  OpenAI computer-use-preview agent loop implementation using liteLLM
+ Paper: https://arxiv.org/abs/2408.00203
+ Code: https://github.com/microsoft/OmniParser
  """

  import asyncio
@@ -9,8 +11,9 @@ import litellm
  import inspect
  import base64

- from ..decorators import agent_loop
- from ..types import Messages, AgentResponse, Tools
+ from ..decorators import register_agent
+ from ..types import Messages, AgentResponse, Tools, AgentCapability
+ from ..loops.base import AsyncAgentConfig

  SOM_TOOL_SCHEMA = {
      "type": "function",
@@ -246,94 +249,185 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
      return [item]


- @agent_loop(models=r"omniparser\+.*|omni\+.*", priority=10)
- async def omniparser_loop(
-     messages: Messages,
-     model: str,
-     tools: Optional[List[Dict[str, Any]]] = None,
-     max_retries: Optional[int] = None,
-     stream: bool = False,
-     computer_handler=None,
-     use_prompt_caching: Optional[bool] = False,
-     _on_api_start=None,
-     _on_api_end=None,
-     _on_usage=None,
-     _on_screenshot=None,
-     **kwargs
- ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
-     """
-     OpenAI computer-use-preview agent loop using liteLLM responses.
-
-     Supports OpenAI's computer use preview models.
-     """
-     if not OMNIPARSER_AVAILABLE:
-         raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
-
-     tools = tools or []
+ @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
+ class OmniparserConfig(AsyncAgentConfig):
+     """Omniparser agent configuration implementing AsyncAgentConfig protocol."""

-     llm_model = model.split('+')[-1]
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         OpenAI computer-use-preview agent loop using liteLLM responses.
+
+         Supports OpenAI's computer use preview models.
+         """
+         if not OMNIPARSER_AVAILABLE:
+             raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
+
+         tools = tools or []
+
+         llm_model = model.split('+')[-1]

-     # Prepare tools for OpenAI API
-     openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
+         # Prepare tools for OpenAI API
+         openai_tools, id2xy = _prepare_tools_for_omniparser(tools)

-     # Find last computer_call_output
-     last_computer_call_output = get_last_computer_call_output(messages)
-     if last_computer_call_output:
-         image_url = last_computer_call_output.get("output", {}).get("image_url", "")
-         image_data = image_url.split(",")[-1]
-         if image_data:
-             parser = get_parser()
-             result = parser.parse(image_data)
-             if _on_screenshot:
-                 await _on_screenshot(result.annotated_image_base64, "annotated_image")
-             for element in result.elements:
-                 id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
-
-     # handle computer calls -> function calls
-     new_messages = []
-     for message in messages:
-         if not isinstance(message, dict):
-             message = message.__dict__
-         new_messages += await replace_computer_call_with_function(message, id2xy)
-     messages = new_messages
+         # Find last computer_call_output
+         last_computer_call_output = get_last_computer_call_output(messages)  # type: ignore
+         if last_computer_call_output:
+             image_url = last_computer_call_output.get("output", {}).get("image_url", "")
+             image_data = image_url.split(",")[-1]
+             if image_data:
+                 parser = get_parser()
+                 result = parser.parse(image_data)
+                 if _on_screenshot:
+                     await _on_screenshot(result.annotated_image_base64, "annotated_image")
+                 for element in result.elements:
+                     id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
+
+         # handle computer calls -> function calls
+         new_messages = []
+         for message in messages:
+             if not isinstance(message, dict):
+                 message = message.__dict__
+             new_messages += await replace_computer_call_with_function(message, id2xy)  # type: ignore
+         messages = new_messages

-     # Prepare API call kwargs
-     api_kwargs = {
-         "model": llm_model,
-         "input": messages,
-         "tools": openai_tools if openai_tools else None,
-         "stream": stream,
-         "reasoning": {"summary": "concise"},
-         "truncation": "auto",
-         "num_retries": max_retries,
-         **kwargs
-     }
-
-     # Call API start hook
-     if _on_api_start:
-         await _on_api_start(api_kwargs)
-
-     print(str(api_kwargs)[:1000])
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": llm_model,
+             "input": messages,
+             "tools": openai_tools if openai_tools else None,
+             "stream": stream,
+             "truncation": "auto",
+             "num_retries": max_retries,
+             **kwargs
+         }
+
+         # Call API start hook
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         print(str(api_kwargs)[:1000])
+
+         # Use liteLLM responses
+         response = await litellm.aresponses(**api_kwargs)

-     # Use liteLLM responses
-     response = await litellm.aresponses(**api_kwargs)
+         # Call API end hook
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)

-     # Call API end hook
-     if _on_api_end:
-         await _on_api_end(api_kwargs, response)
+         # Extract usage information
+         usage = {
+             **response.usage.model_dump(),  # type: ignore
+             "response_cost": response._hidden_params.get("response_cost", 0.0),  # type: ignore
+         }
+         if _on_usage:
+             await _on_usage(usage)

-     # Extract usage information
-     response.usage = {
-         **response.usage.model_dump(),
-         "response_cost": response._hidden_params.get("response_cost", 0.0),
-     }
-     if _on_usage:
-         await _on_usage(response.usage)
+         # handle som function calls -> xy computer calls
+         new_output = []
+         for i in range(len(response.output)):  # type: ignore
+             new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)  # type: ignore
+
+         return {
+             "output": new_output,
+             "usage": usage
+         }
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str,
+         **kwargs
+     ) -> Optional[Tuple[float, float]]:
+         """
+         Predict click coordinates using OmniParser and LLM.
+
+         Uses OmniParser to annotate the image with element IDs, then uses LLM
+         to identify the correct element ID based on the instruction.
+         """
+         if not OMNIPARSER_AVAILABLE:
+             return None
+
+         # Parse the image with OmniParser to get annotated image and elements
+         parser = get_parser()
+         result = parser.parse(image_b64)
+
+         # Extract the LLM model from composed model string
+         llm_model = model.split('+')[-1]
+
+         # Create system prompt for element ID prediction
+         SYSTEM_PROMPT = f'''
+         You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.

-     # handle som function calls -> xy computer calls
-     new_output = []
-     for i in range(len(response.output)):
-         new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)
-     response.output = new_output
+         The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.

-     return response
+         Output only the element ID as a single integer.
+         '''.strip()
+
+         # Prepare messages for LLM
+         messages = [
+             {
+                 "role": "system",
+                 "content": SYSTEM_PROMPT
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/png;base64,{result.annotated_image_base64}"
+                         }
+                     },
+                     {
+                         "type": "text",
+                         "text": f"Find the element: {instruction}"
+                     }
+                 ]
+             }
+         ]
+
+         # Call LLM to predict element ID
+         response = await litellm.acompletion(
+             model=llm_model,
+             messages=messages,
+             max_tokens=10,
+             temperature=0.1
+         )
+
+         # Extract element ID from response
+         response_text = response.choices[0].message.content.strip()  # type: ignore
+
+         # Try to parse the element ID
+         try:
+             element_id = int(response_text)
+
+             # Find the element with this ID and return its center coordinates
+             for element in result.elements:
+                 if element.id == element_id:
+                     center_x = (element.bbox.x1 + element.bbox.x2) / 2
+                     center_y = (element.bbox.y1 + element.bbox.y2) / 2
+                     return (center_x, center_y)
+         except ValueError:
+             # If we can't parse the ID, return None
+             pass
+
+         return None
+
+     def get_capabilities(self) -> List[AgentCapability]:
+         """Return the capabilities supported by this agent."""
+         return ["step"]
agent/loops/openai.py CHANGED
@@ -3,31 +3,49 @@ OpenAI computer-use-preview agent loop implementation using liteLLM
  """

  import asyncio
+ import base64
  import json
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+ from io import BytesIO
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
  import litellm
+ from PIL import Image

- from ..decorators import agent_loop
- from ..types import Messages, AgentResponse, Tools
+ from ..decorators import register_agent
+ from ..types import Messages, AgentResponse, Tools, AgentCapability

- def _map_computer_tool_to_openai(computer_tool: Any) -> Dict[str, Any]:
+ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
      """Map a computer tool to OpenAI's computer-use-preview tool schema"""
+     # Get dimensions from the computer handler
+     try:
+         width, height = await computer_handler.get_dimensions()
+     except Exception:
+         # Fallback to default dimensions if method fails
+         width, height = 1024, 768
+
+     # Get environment from the computer handler
+     try:
+         environment = await computer_handler.get_environment()
+     except Exception:
+         # Fallback to default environment if method fails
+         environment = "linux"
+
      return {
          "type": "computer_use_preview",
-         "display_width": getattr(computer_tool, 'display_width', 1024),
-         "display_height": getattr(computer_tool, 'display_height', 768),
-         "environment": getattr(computer_tool, 'environment', "linux")  # mac, windows, linux, browser
+         "display_width": width,
+         "display_height": height,
+         "environment": environment  # mac, windows, linux, browser
      }


- def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
+ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
      """Prepare tools for OpenAI API format"""
      openai_tools = []

      for schema in tool_schemas:
          if schema["type"] == "computer":
              # Map computer tool to OpenAI format
-             openai_tools.append(_map_computer_tool_to_openai(schema["computer"]))
+             computer_tool = await _map_computer_tool_to_openai(schema["computer"])
+             openai_tools.append(computer_tool)
          elif schema["type"] == "function":
              # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
              # Schema should be: {type, name, description, parameters}
@@ -36,60 +54,182 @@ def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
      return openai_tools


- @agent_loop(models=r".*computer-use-preview.*", priority=10)
- async def openai_computer_use_loop(
-     messages: Messages,
-     model: str,
-     tools: Optional[List[Dict[str, Any]]] = None,
-     max_retries: Optional[int] = None,
-     stream: bool = False,
-     computer_handler=None,
-     use_prompt_caching: Optional[bool] = False,
-     _on_api_start=None,
-     _on_api_end=None,
-     _on_usage=None,
-     _on_screenshot=None,
-     **kwargs
- ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+ @register_agent(models=r".*computer-use-preview.*")
+ class OpenAIComputerUseConfig:
      """
-     OpenAI computer-use-preview agent loop using liteLLM responses.
+     OpenAI computer-use-preview agent configuration using liteLLM responses.

      Supports OpenAI's computer use preview models.
      """
-     tools = tools or []

-     # Prepare tools for OpenAI API
-     openai_tools = _prepare_tools_for_openai(tools)
-
-     # Prepare API call kwargs
-     api_kwargs = {
-         "model": model,
-         "input": messages,
-         "tools": openai_tools if openai_tools else None,
-         "stream": stream,
-         "reasoning": {"summary": "concise"},
-         "truncation": "auto",
-         "num_retries": max_retries,
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
          **kwargs
-     }
-
-     # Call API start hook
-     if _on_api_start:
-         await _on_api_start(api_kwargs)
-
-     # Use liteLLM responses
-     response = await litellm.aresponses(**api_kwargs)
-
-     # Call API end hook
-     if _on_api_end:
-         await _on_api_end(api_kwargs, response)
+     ) -> Dict[str, Any]:
+         """
+         Predict the next step based on input items.
+
+         Args:
+             messages: Input items following Responses format
+             model: Model name to use
+             tools: Optional list of tool schemas
+             max_retries: Maximum number of retries
+             stream: Whether to stream responses
+             computer_handler: Computer handler instance
+             _on_api_start: Callback for API start
+             _on_api_end: Callback for API end
+             _on_usage: Callback for usage tracking
+             _on_screenshot: Callback for screenshot events
+             **kwargs: Additional arguments
+
+         Returns:
+             Dictionary with "output" (output items) and "usage" array
+         """
+         tools = tools or []
+
+         # Prepare tools for OpenAI API
+         openai_tools = await _prepare_tools_for_openai(tools)

-     # Extract usage information
-     response.usage = {
-         **response.usage.model_dump(),
-         "response_cost": response._hidden_params.get("response_cost", 0.0),
-     }
-     if _on_usage:
-         await _on_usage(response.usage)
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": model,
+             "input": messages,
+             "tools": openai_tools if openai_tools else None,
+             "stream": stream,
+             "reasoning": {"summary": "concise"},
+             "truncation": "auto",
+             "num_retries": max_retries,
+             **kwargs
+         }
+
+         # Call API start hook
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         # Use liteLLM responses
+         response = await litellm.aresponses(**api_kwargs)
+
+         # Call API end hook
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)
+
+         # Extract usage information
+         usage = {
+             **response.usage.model_dump(),
+             "response_cost": response._hidden_params.get("response_cost", 0.0),
+         }
+         if _on_usage:
+             await _on_usage(usage)
+
+         # Return in the expected format
+         output_dict = response.model_dump()
+         output_dict["usage"] = usage
+         return output_dict
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str
+     ) -> Optional[Tuple[int, int]]:
+         """
+         Predict click coordinates based on image and instruction.
+
+         Uses OpenAI computer-use-preview with manually constructed input items
+         and a prompt that instructs the agent to only output clicks.
+
+         Args:
+             model: Model name to use
+             image_b64: Base64 encoded image
+             instruction: Instruction for where to click
+
+         Returns:
+             Tuple of (x, y) coordinates or None if prediction fails
+         """
+         # TODO: use computer tool to get dimensions + environment
+         # Manually construct input items with image and click instruction
+         input_items = [
+             {
+                 "role": "user",
+                 "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "input_image",
+                         "image_url": f"data:image/png;base64,{image_b64}"
+                     }
+                 ]
+             }
+         ]
+
+         # Get image dimensions from base64 data
+         try:
+             image_data = base64.b64decode(image_b64)
+             image = Image.open(BytesIO(image_data))
+             display_width, display_height = image.size
+         except Exception:
+             # Fallback to default dimensions if image parsing fails
+             display_width, display_height = 1024, 768
+
+         # Prepare computer tool for click actions
+         computer_tool = {
+             "type": "computer_use_preview",
+             "display_width": display_width,
+             "display_height": display_height,
+             "environment": "windows"
+         }
+
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": model,
+             "input": input_items,
+             "tools": [computer_tool],
+             "stream": False,
+             "reasoning": {"summary": "concise"},
+             "truncation": "auto",
+             "max_tokens": 100  # Keep response short for click prediction
+         }
+
+         # Use liteLLM responses
+         response = await litellm.aresponses(**api_kwargs)
+
+         # Extract click coordinates from response output
+         output_dict = response.model_dump()
+         output_items = output_dict.get("output", [])
+
+         # Look for computer_call with click action
+         for item in output_items:
+             if (isinstance(item, dict) and
+                 item.get("type") == "computer_call" and
+                 isinstance(item.get("action"), dict)):
+
+                 action = item["action"]
+                 if action.get("type") == "click":
+                     x = action.get("x")
+                     y = action.get("y")
+                     if x is not None and y is not None:
+                         return (int(x), int(y))
+
+         return None

-     return response
+     def get_capabilities(self) -> List[AgentCapability]:
+         """
+         Get list of capabilities supported by this agent config.
+
+         Returns:
+             List of capability strings
+         """
+         return ["click", "step"]