cua-agent 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

agent/loops/uitars.py CHANGED
@@ -1,5 +1,7 @@
  """
  UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
+ Paper: https://arxiv.org/abs/2501.12326
+ Code: https://github.com/bytedance/UI-TARS
  """

  import asyncio
@@ -9,7 +11,7 @@ import base64
  import math
  import re
  import ast
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
  from io import BytesIO
  from PIL import Image
  import litellm
@@ -21,8 +23,8 @@ from openai.types.responses.response_input_param import ComputerCallOutput
  from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
  from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary

- from ..decorators import agent_loop
- from ..types import Messages, AgentResponse, Tools
+ from ..decorators import register_agent
+ from ..types import Messages, AgentResponse, Tools, AgentCapability
  from ..responses import (
      make_reasoning_item,
      make_output_text_item,
@@ -79,6 +81,18 @@ Action: ...
  {instruction}
  """

+ GROUNDING_UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+
+ ## Output Format
+
+ Action: ...
+
+
+ ## Action Space
+ click(point='<|box_start|>(x1,y1)<|box_end|>')
+
+ ## User Instruction
+ {instruction}"""

  def round_by_factor(number: float, factor: int) -> int:
      """Returns the closest integer to 'number' that is divisible by 'factor'."""
@@ -501,188 +515,301 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any

      return litellm_messages

- @agent_loop(models=r"(?i).*ui-?tars.*", priority=10)
- async def uitars_loop(
-     messages: Messages,
-     model: str,
-     tools: Optional[List[Dict[str, Any]]] = None,
-     max_retries: Optional[int] = None,
-     stream: bool = False,
-     computer_handler=None,
-     use_prompt_caching: Optional[bool] = False,
-     _on_api_start=None,
-     _on_api_end=None,
-     _on_usage=None,
-     _on_screenshot=None,
-     **kwargs
- ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+ @register_agent(models=r"(?i).*ui-?tars.*")
+ class UITARSConfig:
      """
-     UITARS agent loop using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
+     UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.

      Supports UITARS vision-language models for computer control.
      """
-     tools = tools or []
-
-     # Create response items
-     response_items = []
-
-     # Find computer tool for screen dimensions
-     computer_tool = None
-     for tool_schema in tools:
-         if tool_schema["type"] == "computer":
-             computer_tool = tool_schema["computer"]
-             break

-     # Get screen dimensions
-     screen_width, screen_height = 1024, 768
-     if computer_tool:
-         try:
-             screen_width, screen_height = await computer_tool.get_dimensions()
-         except:
-             pass
-
-     # Process messages to extract instruction and image
-     instruction = ""
-     image_data = None
-
-     # Convert messages to list if string
-     if isinstance(messages, str):
-         messages = [{"role": "user", "content": messages}]
-
-     # Extract instruction and latest screenshot
-     for message in reversed(messages):
-         if isinstance(message, dict):
-             content = message.get("content", "")
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Predict the next step based on input messages.
+
+         Args:
+             messages: Input messages following Responses format
+             model: Model name to use
+             tools: Optional list of tool schemas
+             max_retries: Maximum number of retries
+             stream: Whether to stream responses
+             computer_handler: Computer handler instance
+             _on_api_start: Callback for API start
+             _on_api_end: Callback for API end
+             _on_usage: Callback for usage tracking
+             _on_screenshot: Callback for screenshot events
+             **kwargs: Additional arguments

-             # Handle different content formats
-             if isinstance(content, str):
-                 if not instruction and message.get("role") == "user":
-                     instruction = content
-             elif isinstance(content, list):
-                 for item in content:
-                     if isinstance(item, dict):
-                         if item.get("type") == "text" and not instruction:
-                             instruction = item.get("text", "")
-                         elif item.get("type") == "image_url" and not image_data:
-                             image_url = item.get("image_url", {})
-                             if isinstance(image_url, dict):
-                                 image_data = image_url.get("url", "")
-                             else:
-                                 image_data = image_url
-
-             # Also check for computer_call_output with screenshots
-             if message.get("type") == "computer_call_output" and not image_data:
-                 output = message.get("output", {})
-                 if isinstance(output, dict) and output.get("type") == "input_image":
-                     image_data = output.get("image_url", "")
-
-             if instruction and image_data:
-                 break
-
-     if not instruction:
-         instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
-
-     # Create prompt
-     user_prompt = UITARS_PROMPT_TEMPLATE.format(
-         instruction=instruction,
-         action_space=UITARS_ACTION_SPACE,
-         language="English"
-     )
-
-     # Convert conversation history to LiteLLM format
-     history_messages = convert_uitars_messages_to_litellm(messages)
-
-     # Prepare messages for liteLLM
-     litellm_messages = [
-         {
-             "role": "system",
-             "content": "You are a helpful assistant."
-         }
-     ]
-
-     # Add current user instruction with screenshot
-     current_user_message = {
-         "role": "user",
-         "content": [
-             {"type": "text", "text": user_prompt},
+         Returns:
+             Dictionary with "output" (output items) and "usage" array
+         """
+         tools = tools or []
+
+         # Create response items
+         response_items = []
+
+         # Find computer tool for screen dimensions
+         computer_tool = None
+         for tool_schema in tools:
+             if tool_schema["type"] == "computer":
+                 computer_tool = tool_schema["computer"]
+                 break
+
+         # Get screen dimensions
+         screen_width, screen_height = 1024, 768
+         if computer_tool:
+             try:
+                 screen_width, screen_height = await computer_tool.get_dimensions()
+             except:
+                 pass
+
+         # Process messages to extract instruction and image
+         instruction = ""
+         image_data = None
+
+         # Convert messages to list if string
+         if isinstance(messages, str):
+             messages = [{"role": "user", "content": messages}]
+
+         # Extract instruction and latest screenshot
+         for message in reversed(messages):
+             if isinstance(message, dict):
+                 content = message.get("content", "")
+
+                 # Handle different content formats
+                 if isinstance(content, str):
+                     if not instruction and message.get("role") == "user":
+                         instruction = content
+                 elif isinstance(content, list):
+                     for item in content:
+                         if isinstance(item, dict):
+                             if item.get("type") == "text" and not instruction:
+                                 instruction = item.get("text", "")
+                             elif item.get("type") == "image_url" and not image_data:
+                                 image_url = item.get("image_url", {})
+                                 if isinstance(image_url, dict):
+                                     image_data = image_url.get("url", "")
+                                 else:
+                                     image_data = image_url
+
+                 # Also check for computer_call_output with screenshots
+                 if message.get("type") == "computer_call_output" and not image_data:
+                     output = message.get("output", {})
+                     if isinstance(output, dict) and output.get("type") == "input_image":
+                         image_data = output.get("image_url", "")
+
+                 if instruction and image_data:
+                     break
+
+         if not instruction:
+             instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
+
+         # Create prompt
+         user_prompt = UITARS_PROMPT_TEMPLATE.format(
+             instruction=instruction,
+             action_space=UITARS_ACTION_SPACE,
+             language="English"
+         )
+
+         # Convert conversation history to LiteLLM format
+         history_messages = convert_uitars_messages_to_litellm(messages)
+
+         # Prepare messages for liteLLM
+         litellm_messages = [
+             {
+                 "role": "system",
+                 "content": "You are a helpful assistant."
+             }
          ]
-     }
-     litellm_messages.append(current_user_message)
-
-     # Process image for UITARS
-     if not image_data:
-         # Take screenshot if none found in messages
-         if computer_handler:
-             image_data = await computer_handler.screenshot()
-             await _on_screenshot(image_data, "screenshot_before")
-
-             # Add screenshot to output items so it can be retained in history
-             response_items.append(make_input_image_item(image_data))
-         else:
-             raise ValueError("No screenshot found in messages and no computer_handler provided")
-     processed_image, original_width, original_height = process_image_for_uitars(image_data)
-     encoded_image = pil_to_base64(processed_image)
-
-     # Add conversation history
-     if history_messages:
-         litellm_messages.extend(history_messages)
-     else:
-         litellm_messages.append({
-             "role": "user",
+
+         # Add current user instruction with screenshot
+         current_user_message = {
+             "role": "user",
              "content": [
-                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                 {"type": "text", "text": user_prompt},
              ]
-         })
+         }
+         litellm_messages.append(current_user_message)
+
+         # Process image for UITARS
+         if not image_data:
+             # Take screenshot if none found in messages
+             if computer_handler:
+                 image_data = await computer_handler.screenshot()
+                 await _on_screenshot(image_data, "screenshot_before")
+
+                 # Add screenshot to output items so it can be retained in history
+                 response_items.append(make_input_image_item(image_data))
+             else:
+                 raise ValueError("No screenshot found in messages and no computer_handler provided")
+         processed_image, original_width, original_height = process_image_for_uitars(image_data)
+         encoded_image = pil_to_base64(processed_image)
+
+         # Add conversation history
+         if history_messages:
+             litellm_messages.extend(history_messages)
+         else:
+             litellm_messages.append({
+                 "role": "user",
+                 "content": [
+                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                 ]
+             })
+
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": model,
+             "messages": litellm_messages,
+             "max_tokens": kwargs.get("max_tokens", 500),
+             "temperature": kwargs.get("temperature", 0.0),
+             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
+             "num_retries": max_retries,
+             **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+         }
+
+         # Call API start hook
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         # Call liteLLM with UITARS model
+         response = await litellm.acompletion(**api_kwargs)
+
+         # Call API end hook
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)
+
+         # Extract response content
+         response_content = response.choices[0].message.content.strip()  # type: ignore
+
+         # Parse UITARS response
+         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
+
+         # Convert to computer actions
+         computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
+
+         # Add computer actions to response items
+         thought = parsed_responses[0].get("thought", "")
+         if thought:
+             response_items.append(make_reasoning_item(thought))
+         response_items.extend(computer_actions)
+
+         # Extract usage information
+         response_usage = {
+             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+             "response_cost": response._hidden_params.get("response_cost", 0.0),
+         }
+         if _on_usage:
+             await _on_usage(response_usage)

-     # Prepare API call kwargs
-     api_kwargs = {
-         "model": model,
-         "messages": litellm_messages,
-         "max_tokens": kwargs.get("max_tokens", 500),
-         "temperature": kwargs.get("temperature", 0.0),
-         "do_sample": kwargs.get("temperature", 0.0) > 0.0,
-         "num_retries": max_retries,
-         **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
-     }
-
-     # Call API start hook
-     if _on_api_start:
-         await _on_api_start(api_kwargs)
-
-     # Call liteLLM with UITARS model
-     response = await litellm.acompletion(**api_kwargs)
-
-     # Call API end hook
-     if _on_api_end:
-         await _on_api_end(api_kwargs, response)
-
-     # Extract response content
-     response_content = response.choices[0].message.content.strip()  # type: ignore
-
-     # Parse UITARS response
-     parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
-     # Convert to computer actions
-     computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
-
-     # Add computer actions to response items
-     thought = parsed_responses[0].get("thought", "")
-     if thought:
-         response_items.append(make_reasoning_item(thought))
-     response_items.extend(computer_actions)
-
-     # Extract usage information
-     response_usage = {
-         **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
-         "response_cost": response._hidden_params.get("response_cost", 0.0),
-     }
-     if _on_usage:
-         await _on_usage(response_usage)
-
-     # Create agent response
-     agent_response = {
-         "output": response_items,
-         "usage": response_usage
-     }
-
-     return agent_response
+         # Create agent response
+         agent_response = {
+             "output": response_items,
+             "usage": response_usage
+         }
+
+         return agent_response
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str
+     ) -> Optional[Tuple[int, int]]:
+         """
+         Predict click coordinates based on image and instruction.
+
+         UITARS supports click prediction through its action parsing.
+
+         Args:
+             model: Model name to use
+             image_b64: Base64 encoded image
+             instruction: Instruction for where to click
+
+         Returns:
+             Tuple with (x, y) coordinates or None
+         """
+         try:
+             # Create prompt using grounding template
+             user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
+                 instruction=instruction
+             )
+
+             # Process image for UITARS
+             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
+             encoded_image = pil_to_base64(processed_image)
+
+             # Prepare messages for liteLLM
+             litellm_messages = [
+                 {
+                     "role": "system",
+                     "content": "You are a helpful assistant."
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": user_prompt},
+                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                     ]
+                 }
+             ]
+
+             # Prepare API call kwargs
+             api_kwargs = {
+                 "model": model,
+                 "messages": litellm_messages,
+                 "max_tokens": 100,
+                 "temperature": 0.0,
+                 "do_sample": False
+             }
+
+             # Call liteLLM with UITARS model
+             response = await litellm.acompletion(**api_kwargs)
+
+             # Extract response content
+             response_content = response.choices[0].message.content.strip()  # type: ignore
+
+             # Parse the response to extract click coordinates
+             # Look for click action with coordinates
+             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
+             match = re.search(click_pattern, response_content)
+
+             if match:
+                 x, y = int(match.group(1)), int(match.group(2))
+                 # Scale coordinates back to original image dimensions
+                 scale_x = original_width / processed_image.width
+                 scale_y = original_height / processed_image.height
+
+                 scaled_x = int(x * scale_x)
+                 scaled_y = int(y * scale_y)
+
+                 return (scaled_x, scaled_y)
+
+             return None
+
+         except Exception as e:
+             # Log error and return None
+             print(f"Error in predict_click: {e}")
+             return None
+
+     def get_capabilities(self) -> List[AgentCapability]:
+         """
+         Get list of capabilities supported by this agent config.
+
+         Returns:
+             List of capability strings
+         """
+         return ["step", "click"]
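For orientation, the new predict_click method asks the model for a click(point='<|box_start|>(x,y)<|box_end|>') action and rescales the parsed point from the resized image back to the original screenshot. Below is a minimal standalone sketch of that parsing and rescaling step; the parse_click helper, the sample output string, and the dimensions are illustrative only and not part of the package.

import re
from typing import Optional, Tuple


def parse_click(response: str,
                processed_w: int, processed_h: int,
                original_w: int, original_h: int) -> Optional[Tuple[int, int]]:
    """Extract (x, y) from a UI-TARS click action and rescale it to the
    original screenshot resolution, mirroring the logic in predict_click."""
    pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
    match = re.search(pattern, response)
    if not match:
        return None
    x, y = int(match.group(1)), int(match.group(2))
    # Coordinates are reported against the resized image, so map them back.
    return int(x * original_w / processed_w), int(y * original_h / processed_h)


# Hypothetical model output; the real text depends on the prompt and screenshot.
sample = "Action: click(point='<|box_start|>(512,384)<|box_end|>')"
print(parse_click(sample, 1024, 768, 1920, 1080))  # -> (960, 540)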