cua-agent 0.1.30__tar.gz → 0.1.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (84)
  1. {cua_agent-0.1.30 → cua_agent-0.1.32}/PKG-INFO +11 -4
  2. {cua_agent-0.1.30 → cua_agent-0.1.32}/README.md +10 -3
  3. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/loop.py +2 -0
  4. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/computer.py +11 -9
  5. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/oaicompat.py +12 -2
  6. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/loop.py +2 -0
  7. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/loop.py +4 -0
  8. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/computer.py +44 -7
  9. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/clients/oaicompat.py +24 -16
  10. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/loop.py +18 -39
  11. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/prompts.py +5 -1
  12. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/computer.py +6 -2
  13. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/utils.py +112 -1
  14. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/gradio/app.py +58 -127
  15. {cua_agent-0.1.30 → cua_agent-0.1.32}/pyproject.toml +3 -3
  16. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/__init__.py +0 -0
  17. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/__init__.py +0 -0
  18. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/agent.py +0 -0
  19. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/base.py +0 -0
  20. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/callbacks.py +0 -0
  21. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/experiment.py +0 -0
  22. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/factory.py +0 -0
  23. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/messages.py +0 -0
  24. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/provider_config.py +0 -0
  25. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/telemetry.py +0 -0
  26. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/__init__.py +0 -0
  27. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/base.py +0 -0
  28. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/bash.py +0 -0
  29. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/collection.py +0 -0
  30. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/computer.py +0 -0
  31. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/edit.py +0 -0
  32. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/manager.py +0 -0
  33. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools.py +0 -0
  34. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/types.py +0 -0
  35. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/visualization.py +0 -0
  36. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/__init__.py +0 -0
  37. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/__init__.py +0 -0
  38. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api/client.py +0 -0
  39. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api/logging.py +0 -0
  40. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api_handler.py +0 -0
  41. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  42. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/callbacks/manager.py +0 -0
  43. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/prompts.py +0 -0
  44. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/response_handler.py +0 -0
  45. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/__init__.py +0 -0
  46. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/base.py +0 -0
  47. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/bash.py +0 -0
  48. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/collection.py +0 -0
  49. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/edit.py +0 -0
  50. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/manager.py +0 -0
  51. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/run.py +0 -0
  52. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/types.py +0 -0
  53. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/utils.py +0 -0
  54. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/__init__.py +0 -0
  55. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/api_handler.py +0 -0
  56. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/anthropic.py +0 -0
  57. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/base.py +0 -0
  58. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/ollama.py +0 -0
  59. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/openai.py +0 -0
  60. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/utils.py +0 -0
  61. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/image_utils.py +0 -0
  62. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/parser.py +0 -0
  63. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/prompts.py +0 -0
  64. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/__init__.py +0 -0
  65. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/base.py +0 -0
  66. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/bash.py +0 -0
  67. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/computer.py +0 -0
  68. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/manager.py +0 -0
  69. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/utils.py +0 -0
  70. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/__init__.py +0 -0
  71. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/api_handler.py +0 -0
  72. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/response_handler.py +0 -0
  73. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/__init__.py +0 -0
  74. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/base.py +0 -0
  75. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/manager.py +0 -0
  76. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/types.py +0 -0
  77. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/utils.py +0 -0
  78. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/__init__.py +0 -0
  79. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/clients/base.py +0 -0
  80. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/__init__.py +0 -0
  81. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/manager.py +0 -0
  82. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/telemetry.py +0 -0
  83. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/__init__.py +0 -0
  84. {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/gradio/__init__.py +0 -0

--- cua_agent-0.1.30/PKG-INFO
+++ cua_agent-0.1.32/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.30
+Version: 0.1.32
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -101,6 +101,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -119,10 +120,10 @@ async with Computer() as macos_computer:
 # model=LLM(provider=LLMProvider.ANTHROPIC)
 # or
 # loop=AgentLoop.OMNI,
-# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
 # or
 # loop=AgentLoop.UITARS,
-# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
 )

 tasks = [
@@ -148,7 +149,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use

 ## Using the Gradio UI

-The agent includes a Gradio-based user interface for easy interaction. To use it:
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+<img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:

 ```bash
 # Install with Gradio support

--- cua_agent-0.1.30/README.md
+++ cua_agent-0.1.32/README.md
@@ -31,6 +31,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -49,10 +50,10 @@ async with Computer() as macos_computer:
 # model=LLM(provider=LLMProvider.ANTHROPIC)
 # or
 # loop=AgentLoop.OMNI,
-# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
 # or
 # loop=AgentLoop.UITARS,
-# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
 )

 tasks = [
@@ -78,7 +79,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use

 ## Using the Gradio UI

-The agent includes a Gradio-based user interface for easy interaction. To use it:
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+<img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:

 ```bash
 # Install with Gradio support
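
Note: both the PKG-INFO and README hunks above reflect an API change in the example code: the model string is now passed to LLM as the `name` keyword rather than `model`, and the UI-TARS example targets a concrete checkpoint instead of the generic "tgi" endpoint name. A minimal sketch of the updated construction, assuming the top-level exports used in the README examples (the endpoint URL is a placeholder):

    from agent import LLM, LLMProvider  # assumed top-level exports

    # Local Ollama model: the model string is now the `name` keyword
    ollama_llm = LLM(provider=LLMProvider.OLLAMA, name="gemma3")

    # UI-TARS served from any OpenAI-compatible endpoint
    uitars_llm = LLM(
        provider=LLMProvider.OAICOMPAT,
        name="ByteDance-Seed/UI-TARS-1.5-7B",
        provider_base_url="https://<endpoint>.us-east-1.aws.endpoints.huggingface.cloud/v1",
    )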

--- cua_agent-0.1.30/agent/providers/anthropic/loop.py
+++ cua_agent-0.1.32/agent/providers/anthropic/loop.py
@@ -279,6 +279,8 @@ class AnthropicLoop(BaseLoop):
                 messages,
                 model=self.model,
             )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=openai_compatible_response)
             await queue.put(openai_compatible_response)

             if not should_continue:

--- cua_agent-0.1.30/agent/providers/anthropic/tools/computer.py
+++ cua_agent-0.1.32/agent/providers/anthropic/tools/computer.py
@@ -161,15 +161,17 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             self.logger.info(f"Moving cursor to ({x}, {y})")
             await self.computer.interface.move_cursor(x, y)
         elif action == "left_click_drag":
-            self.logger.info(f"Dragging from ({x}, {y})")
-            # First move to the position
-            await self.computer.interface.move_cursor(x, y)
-            # Then perform drag operation - check if drag_to exists or we need to use other methods
-            try:
-                await self.computer.interface.drag_to(x, y)
-            except Exception as e:
-                self.logger.error(f"Error during drag operation: {str(e)}")
-                raise ToolError(f"Failed to perform drag: {str(e)}")
+            # Get the start coordinate from kwargs
+            start_coordinate = kwargs.get("start_coordinate")
+            if not start_coordinate:
+                raise ToolError("start_coordinate is required for left_click_drag action")
+
+            start_x, start_y = start_coordinate
+            end_x, end_y = x, y
+
+            self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({end_x}, {end_y})")
+            await self.computer.interface.move_cursor(start_x, start_y)
+            await self.computer.interface.drag_to(end_x, end_y)

         # Wait briefly for any UI changes
         await asyncio.sleep(0.5)
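
Note: the rewritten left_click_drag branch drags between two explicit points instead of moving to (x, y) and dragging to that same spot, and it now fails fast when the start point is missing. A sketch of the tool input this branch expects (the field name for the end point is inferred from the surrounding handler, so treat it as an assumption):

    # Hypothetical Anthropic tool-use input for the new drag behavior
    tool_input = {
        "action": "left_click_drag",
        "start_coordinate": (120, 300),  # now required; ToolError if missing
        "coordinate": (480, 300),        # end point, unpacked to x, y by the handler
    }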

--- cua_agent-0.1.30/agent/providers/omni/clients/oaicompat.py
+++ cua_agent-0.1.32/agent/providers/omni/clients/oaicompat.py
@@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

-        final_messages = [{"role": "system", "content": system}]
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]

         # Process messages
         for item in messages:
@@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient):
                 else:
                     message = {
                         "role": item["role"],
-                        "content": [{"type": "text", "text": item["content"]}],
+                        "content": [{
+                            "type": "text",
+                            "text": item["content"]
+                        }],
                     }
                     final_messages.append(message)
             else:

--- cua_agent-0.1.30/agent/providers/omni/loop.py
+++ cua_agent-0.1.32/agent/providers/omni/loop.py
@@ -670,6 +670,8 @@ class OmniLoop(BaseLoop):
                     parsed_screen=parsed_screen,
                     parser=self.parser
                 )
+                # Log standardized response for ease of parsing
+                self._log_api_call("agent_response", request=None, response=openai_compatible_response)

                 # Yield the response to the caller
                 yield openai_compatible_response

--- cua_agent-0.1.30/agent/providers/openai/loop.py
+++ cua_agent-0.1.32/agent/providers/openai/loop.py
@@ -276,6 +276,10 @@ class OpenAILoop(BaseLoop):
             )
             # Don't reset last_response_id to None - keep the previous value if available

+
+            # Log standardized response for ease of parsing
+            # Since this is the openAI responses format, we don't need to convert it to agent response format
+            self._log_api_call("agent_response", request=None, response=response)
             # Process API response
             await queue.put(response)


--- cua_agent-0.1.30/agent/providers/openai/tools/computer.py
+++ cua_agent-0.1.32/agent/providers/openai/tools/computer.py
@@ -44,6 +44,7 @@ Action = Literal[
     "double_click",
     "screenshot",
     "scroll",
+    "drag",
 ]


@@ -162,9 +163,14 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             y = kwargs.get("y")
             if x is None or y is None:
                 raise ToolError("x and y coordinates are required for scroll action")
-            scroll_x = kwargs.get("scroll_x", 0) // 20
-            scroll_y = kwargs.get("scroll_y", 0) // 20
+            scroll_x = kwargs.get("scroll_x", 0) // 50
+            scroll_y = kwargs.get("scroll_y", 0) // 50
             return await self.handle_scroll(x, y, scroll_x, scroll_y)
+        elif type == "drag":
+            path = kwargs.get("path")
+            if not path or not isinstance(path, list) or len(path) < 2:
+                raise ToolError("path is required for drag action and must contain at least 2 points")
+            return await self.handle_drag(path)
         elif type == "screenshot":
             return await self.screenshot()
         elif type == "wait":
@@ -240,11 +246,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):

             if len(mapped_keys) > 1:
                 # For key combinations (like Ctrl+C)
-                for k in mapped_keys:
-                    await self.computer.interface.press_key(k)
-                    await asyncio.sleep(0.1)
-                for k in reversed(mapped_keys):
-                    await self.computer.interface.press_key(k)
+                await self.computer.interface.hotkey(*mapped_keys)
             else:
                 # Single key press
                 await self.computer.interface.press_key(mapped_keys[0])
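
Note: the old key-combination path pressed each key individually (and then again in reverse), which taps keys one after another instead of holding the chord. The replacement sends the combination atomically through the interface's hotkey method, which the hunk above confirms exists. A sketch of the difference, assuming a connected computer instance from the companion computer package:

    async def copy_selection(computer) -> None:
        # Old behavior (roughly): sequential taps, so the modifier is never held
        #   await computer.interface.press_key("ctrl")
        #   await computer.interface.press_key("c")
        # New behavior: the whole chord is sent as one combination
        await computer.interface.hotkey("ctrl", "c")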

@@ -306,6 +308,41 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             self.logger.error(f"Error in handle_scroll: {str(e)}")
             raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")

+    async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult:
+        """Handle mouse drag operation using a path of coordinates.
+
+        Args:
+            path: List of coordinate points {"x": int, "y": int} defining the drag path
+
+        Returns:
+            ToolResult with the operation result and screenshot
+        """
+        try:
+            # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format
+            points = [(p["x"], p["y"]) for p in path]
+
+            # Perform drag action
+            if len(points) == 2:
+                await self.computer.interface.move_cursor(points[0][0], points[0][1])
+                await self.computer.interface.drag_to(points[1][0], points[1][1])
+            else:
+                await self.computer.interface.drag(points, button="left")
+
+            # Wait for UI to update
+            await asyncio.sleep(0.5)
+
+            # Take screenshot after action
+            screenshot = await self.computer.interface.screenshot()
+            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
+
+            return ToolResult(
+                output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
+                base64_image=base64_screenshot,
+            )
+        except Exception as e:
+            self.logger.error(f"Error in handle_drag: {str(e)}")
+            raise ToolError(f"Failed to perform drag operation: {str(e)}")
+
     async def screenshot(self) -> ToolResult:
         """Take a screenshot."""
         try:
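
Note: the new drag action accepts a path of at least two points; a two-point path degrades to move_cursor plus drag_to, while longer paths go through the multi-point interface.drag call. A sketch of the payload shape handle_drag consumes:

    # Two-point drag: move to the first point, then drag_to the second
    simple_path = [{"x": 100, "y": 200}, {"x": 400, "y": 200}]

    # Multi-point drag: routed to interface.drag(points, button="left")
    freehand_path = [
        {"x": 100, "y": 200},
        {"x": 180, "y": 260},
        {"x": 260, "y": 240},
    ]
    # result = await tool.handle_drag(freehand_path)  # `tool` is a ComputerTool instance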

--- cua_agent-0.1.30/agent/providers/uitars/clients/oaicompat.py
+++ cua_agent-0.1.32/agent/providers/uitars/clients/oaicompat.py
@@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

-        final_messages = [{"role": "system", "content": system}]
-
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
+
         # Process messages
         for item in messages:
             if isinstance(item, dict):
@@ -138,8 +145,13 @@ class OAICompatClient(BaseUITarsClient):
                 message = {"role": "user", "content": [{"type": "text", "text": item}]}
                 final_messages.append(message)

-        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
-        payload["max_tokens"] = max_tokens or self.max_tokens
+        payload = {
+            "model": self.model,
+            "messages": final_messages,
+            "max_tokens": max_tokens or self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": 0.7,
+        }

        try:
            async with aiohttp.ClientSession() as session:
@@ -178,25 +190,21 @@ class OAICompatClient(BaseUITarsClient):
                    response_text = await response.text()
                    logger.debug(f"Response content: {response_text}")

+                    # if 503, then the endpoint is still warming up
+                    if response.status == 503:
+                        logger.error(f"Endpoint is still warming up, please try again later")
+                        raise Exception(f"Endpoint is still warming up: {response_text}")
+
                    # Try to parse as JSON if the content type is appropriate
                    if "application/json" in response.headers.get('Content-Type', ''):
                        response_json = await response.json()
                    else:
                        raise Exception(f"Response is not JSON format")
-                        # # Optionally try to parse it anyway
-                        # try:
-                        #     import json
-                        #     response_json = json.loads(response_text)
-                        # except json.JSONDecodeError as e:
-                        #     print(f"Failed to parse response as JSON: {e}")

                    if response.status != 200:
-                        error_msg = response_json.get("error", {}).get(
-                            "message", str(response_json)
-                        )
-                        logger.error(f"Error in API call: {error_msg}")
-                        raise Exception(f"API error: {error_msg}")
-
+                        logger.error(f"Error in API call: {response_text}")
+                        raise Exception(f"API error: {response_text}")
+
                    return response_json

        except Exception as e:
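
Note: the client now surfaces HTTP 503 as a distinct "warming up" failure, which is common while a serverless endpoint scales from zero. Callers can treat it as retryable rather than fatal; a minimal sketch, assuming any async callable that raises the exception shown above:

    import asyncio

    async def call_when_warm(make_request, attempts: int = 5, delay: float = 30.0):
        """Retry `make_request` while the endpoint reports it is warming up."""
        for attempt in range(attempts):
            try:
                return await make_request()
            except Exception as e:
                # Matches the message raised by the client on HTTP 503
                if "warming up" in str(e) and attempt < attempts - 1:
                    await asyncio.sleep(delay)
                    continue
                raise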

--- cua_agent-0.1.30/agent/providers/uitars/loop.py
+++ cua_agent-0.1.32/agent/providers/uitars/loop.py
@@ -17,10 +17,10 @@ from ...core.types import AgentResponse, LLMProvider
 from ...core.visualization import VisualizationHelper
 from computer import Computer

-from .utils import add_box_token, parse_actions, parse_action_parameters
+from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
 from .tools.manager import ToolManager
 from .tools.computer import ToolResult
-from .prompts import COMPUTER_USE, SYSTEM_PROMPT
+from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES

 from .clients.oaicompat import OAICompatClient

@@ -184,7 +184,7 @@ class UITARSLoop(BaseLoop):
         if first_user_idx is not None and instruction:
             # Create the computer use prompt
             user_prompt = COMPUTER_USE.format(
-                instruction=instruction,
+                instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]),
                 language="English"
             )

@@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop):
         if self.client is None:
             raise RuntimeError("Failed to initialize client")

-        # Convert messages to UI-TARS format
+        # Get messages in standard format from the message manager
+        self.message_manager.messages = messages.copy()
         prepared_messages = self.message_manager.get_messages()
+
+        # Convert messages to UI-TARS format
         uitars_messages = self.to_uitars_format(prepared_messages)

         # Log request
@@ -437,7 +440,7 @@ class UITARSLoop(BaseLoop):
     # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
     ###########################################

-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]:
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
         """Run the agent loop with provided messages.

         Args:
@@ -504,41 +507,16 @@ class UITARSLoop(BaseLoop):

                 # Update whether an action screenshot was saved this turn
                 action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
-
-                # Parse actions from the raw response
-                raw_response = response["choices"][0]["message"]["content"]
-                parsed_actions = parse_actions(raw_response)

-                # Extract thought content if available
-                thought = ""
-                if "Thought:" in raw_response:
-                    thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL)
-                    if thought_match:
-                        thought = thought_match.group(1).strip()
-
-                # Create standardized thought response format
-                thought_response = {
-                    "role": "assistant",
-                    "content": thought or raw_response,
-                    "metadata": {
-                        "title": "🧠 UI-TARS Thoughts"
-                    }
-                }
+                agent_response = await to_agent_response_format(
+                    response,
+                    messages,
+                    model=self.model,
+                )
+                # Log standardized response for ease of parsing
+                self._log_api_call("agent_response", request=None, response=agent_response)
+                yield agent_response

-                # Create action response format
-                action_response = {
-                    "role": "assistant",
-                    "content": str(parsed_actions),
-                    "metadata": {
-                        "title": "🖱️ UI-TARS Actions",
-                    }
-                }
-
-                # Yield both responses to the caller (thoughts first, then actions)
-                yield thought_response
-                if parsed_actions:
-                    yield action_response
-
                 # Check if we should continue this conversation
                 running = should_continue

@@ -559,7 +537,8 @@ class UITARSLoop(BaseLoop):
                 logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")

                 yield {
-                    "error": str(e),
+                    "role": "assistant",
+                    "content": f"Error: {str(e)}",
                     "metadata": {"title": "❌ Error"},
                 }


--- cua_agent-0.1.30/agent/providers/uitars/prompts.py
+++ cua_agent-0.1.32/agent/providers/uitars/prompts.py
@@ -1,5 +1,9 @@
 """Prompts for UI-TARS agent."""

+MAC_SPECIFIC_NOTES = """
+(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
+"""
+
 SYSTEM_PROMPT = "You are a helpful assistant."

 COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
@@ -56,4 +60,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par

 ## User Instruction
 {instruction}
-"""
+"""

--- cua_agent-0.1.30/agent/providers/uitars/tools/computer.py
+++ cua_agent-0.1.32/agent/providers/uitars/tools/computer.py
@@ -173,9 +173,13 @@ class ComputerTool(BaseComputerTool):
         elif action == "hotkey":
             if "keys" in kwargs:
                 keys = kwargs["keys"]
-                for key in keys:
-                    await self.computer.interface.press_key(key)

+                if len(keys) > 1:
+                    await self.computer.interface.hotkey(*keys)
+                else:
+                    # Single key press
+                    await self.computer.interface.press_key(keys[0])
+
             # Wait for UI to update
             await asyncio.sleep(0.3)


--- cua_agent-0.1.30/agent/providers/uitars/utils.py
+++ cua_agent-0.1.32/agent/providers/uitars/utils.py
@@ -4,9 +4,114 @@ import logging
 import base64
 import re
 from typing import Any, Dict, List, Optional, Union, Tuple
+from datetime import datetime

 logger = logging.getLogger(__name__)

+from ...core.types import AgentResponse
+
+async def to_agent_response_format(
+    response: Dict[str, Any],
+    messages: List[Dict[str, Any]],
+    model: Optional[str] = None,
+) -> AgentResponse:
+    """Convert raw UI-TARS response to agent response format.
+
+    Args:
+        response: Raw UI-TARS response
+        messages: List of messages in standard format
+        model: Optional model name
+
+    Returns:
+        AgentResponse: Standardized agent response format
+    """
+    # Create unique IDs for this response
+    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
+    reasoning_id = f"rs_{response_id}"
+    action_id = f"cu_{response_id}"
+    call_id = f"call_{response_id}"
+
+    # Parse actions from the raw response
+    content = response["choices"][0]["message"]["content"]
+    actions = parse_actions(content)
+
+    # Extract thought content if available
+    reasoning_text = ""
+    if "Thought:" in content:
+        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
+        if thought_match:
+            reasoning_text = thought_match.group(1).strip()
+
+    # Create output items
+    output_items = []
+    if reasoning_text:
+        output_items.append({
+            "type": "reasoning",
+            "id": reasoning_id,
+            "text": reasoning_text
+        })
+    if actions:
+        for i, action in enumerate(actions):
+            action_name, tool_args = parse_action_parameters(action)
+            if action_name == "finished":
+                output_items.append({
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [{
+                        "type": "output_text",
+                        "text": tool_args["content"]
+                    }],
+                    "id": f"action_{i}_{action_id}",
+                    "status": "completed"
+                })
+            else:
+                if tool_args.get("action") == action_name:
+                    del tool_args["action"]
+                output_items.append({
+                    "type": "computer_call",
+                    "id": f"{action}_{i}_{action_id}",
+                    "call_id": f"call_{i}_{action_id}",
+                    "action": { "type": action_name, **tool_args },
+                    "pending_safety_checks": [],
+                    "status": "completed"
+                })
+
+    # Create agent response
+    agent_response = AgentResponse(
+        id=response_id,
+        object="response",
+        created_at=int(datetime.now().timestamp()),
+        status="completed",
+        error=None,
+        incomplete_details=None,
+        instructions=None,
+        max_output_tokens=None,
+        model=model or response["model"],
+        output=output_items,
+        parallel_tool_calls=True,
+        previous_response_id=None,
+        reasoning={"effort": "medium"},
+        store=True,
+        temperature=0.0,
+        top_p=0.7,
+        text={"format": {"type": "text"}},
+        tool_choice="auto",
+        tools=[
+            {
+                "type": "computer_use_preview",
+                "display_height": 768,
+                "display_width": 1024,
+                "environment": "mac",
+            }
+        ],
+        truncation="auto",
+        usage=response["usage"],
+        user=None,
+        metadata={},
+        response=response
+    )
+    return agent_response
+

 def add_box_token(input_string: str) -> str:
     """Add box tokens to the coordinates in the model response.
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
     """
     # Handle "finished" action
     if action.startswith("finished"):
-        return "finished", {}
+        # Parse content if it exists
+        content_match = re.search(r"content='([^']*)'", action)
+        if content_match:
+            content = content_match.group(1)
+            return "finished", {"content": content}
+        else:
+            return "finished", {}

     # Parse action parameters
     action_match = re.match(r'(\w+)\((.*)\)', action)
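
Note: with UI-TARS responses normalized through to_agent_response_format, downstream code can walk a single output list instead of re-parsing the Thought:/Action: text. A sketch of a consumer over the item shapes built above, treating the response as a plain dict:

    from typing import Any, Dict

    def summarize_output(agent_response: Dict[str, Any]) -> str:
        """Flatten the standardized output items into readable lines."""
        lines = []
        for item in agent_response.get("output", []):
            if item.get("type") == "reasoning":
                lines.append(f"thought: {item.get('text', '')}")
            elif item.get("type") == "computer_call":
                action = item.get("action", {})
                lines.append(f"action: {action.get('type', '')} {action}")
            elif item.get("type") == "message":
                for part in item.get("content", []):
                    if part.get("type") == "output_text":
                        lines.append(f"message: {part.get('text', '')}")
        return "\n".join(lines)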

--- cua_agent-0.1.30/agent/ui/gradio/app.py
+++ cua_agent-0.1.32/agent/ui/gradio/app.py
@@ -35,6 +35,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
 import gradio as gr
 from gradio.components.chatbot import MetadataDict
+from typing import cast

 # Import from agent package
 from agent.core.types import AgentResponse
@@ -322,63 +323,6 @@ def get_ollama_models() -> List[str]:
         logging.error(f"Error getting Ollama models: {e}")
         return []

-
-def extract_synthesized_text(
-    result: Union[AgentResponse, Dict[str, Any]],
-) -> Tuple[str, MetadataDict]:
-    """Extract synthesized text from the agent result."""
-    synthesized_text = ""
-    metadata = MetadataDict()
-
-    if "output" in result and result["output"]:
-        for output in result["output"]:
-            if output.get("type") == "reasoning":
-                metadata["title"] = "🧠 Reasoning"
-                content = output.get("content", "")
-                if content:
-                    synthesized_text += f"{content}\n"
-            elif output.get("type") == "message":
-                # Handle message type outputs - can contain rich content
-                content = output.get("content", [])
-
-                # Content is usually an array of content blocks
-                if isinstance(content, list):
-                    for block in content:
-                        if isinstance(block, dict) and block.get("type") == "output_text":
-                            text_value = block.get("text", "")
-                            if text_value:
-                                synthesized_text += f"{text_value}\n"

-            elif output.get("type") == "computer_call":
-                action = output.get("action", {})
-                action_type = action.get("type", "")
-
-                # Create a descriptive text about the action
-                if action_type == "click":
-                    button = action.get("button", "")
-                    x = action.get("x", "")
-                    y = action.get("y", "")
-                    synthesized_text += f"Clicked {button} at position ({x}, {y}).\n"
-                elif action_type == "type":
-                    text = action.get("text", "")
-                    synthesized_text += f"Typed: {text}.\n"
-                elif action_type == "keypress":
-                    # Extract key correctly from either keys array or key field
-                    if isinstance(action.get("keys"), list):
-                        key = ", ".join(action.get("keys"))
-                    else:
-                        key = action.get("key", "")
-
-                    synthesized_text += f"Pressed key: {key}\n"
-                else:
-                    synthesized_text += f"Performed {action_type} action.\n"
-
-        metadata["status"] = "done"
-        metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
-
-    return synthesized_text.strip(), metadata
-
-
 def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
     """Create or get the global Computer instance."""
     global global_computer
@@ -447,66 +391,6 @@ def create_agent(

     return global_agent

-
-def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
-    """Process agent results for the Gradio UI."""
-    # Extract text content
-    text_obj = result.get("text", {})
-    metadata = result.get("metadata", {})
-
-    # Create a properly typed MetadataDict
-    metadata_dict = MetadataDict()
-    metadata_dict["title"] = metadata.get("title", "")
-    metadata_dict["status"] = "done"
-    metadata = metadata_dict
-
-    # For OpenAI's Computer-Use Agent, text field is an object with format property
-    if (
-        text_obj
-        and isinstance(text_obj, dict)
-        and "format" in text_obj
-        and not text_obj.get("value", "")
-    ):
-        content, metadata = extract_synthesized_text(result)
-    else:
-        if not text_obj:
-            text_obj = result
-
-        # For other types of results, try to get text directly
-        if isinstance(text_obj, dict):
-            if "value" in text_obj:
-                content = text_obj["value"]
-            elif "text" in text_obj:
-                content = text_obj["text"]
-            elif "content" in text_obj:
-                content = text_obj["content"]
-            else:
-                content = ""
-        else:
-            content = str(text_obj) if text_obj else ""
-
-        # If still no content but we have outputs, create a summary
-        if not content and "output" in result and result["output"]:
-            output = result["output"]
-            for out in output:
-                if out.get("type") == "reasoning":
-                    content = out.get("content", "")
-                    if content:
-                        break
-                elif out.get("type") == "computer_call":
-                    action = out.get("action", {})
-                    action_type = action.get("type", "")
-                    if action_type:
-                        content = f"Performing action: {action_type}"
-                        break
-
-    # Clean up the text - ensure content is a string
-    if not isinstance(content, str):
-        content = str(content) if content else ""
-
-    return content, metadata
-
-
 def create_gradio_ui(
     provider_name: str = "openai",
     model_name: str = "gpt-4o",
@@ -907,17 +791,64 @@

                 # Stream responses from the agent
                 async for result in global_agent.run(last_user_message):
-                    # Process result
-                    content, metadata = process_agent_result(result)
-
-                    # Skip empty content
-                    if content or metadata.get("title"):
-                        history.append(
-                            gr.ChatMessage(
-                                role="assistant", content=content, metadata=metadata
+                    print(f"DEBUG - Agent response ------- START")
+                    from pprint import pprint
+                    pprint(result)
+                    print(f"DEBUG - Agent response ------- END")
+
+                    def generate_gradio_messages():
+                        if result.get("content"):
+                            yield gr.ChatMessage(
+                                role="assistant",
+                                content=result.get("content", ""),
+                                metadata=cast(MetadataDict, result.get("metadata", {}))
                             )
-                        )
-                    yield history
+                        else:
+                            outputs = result.get("output", [])
+                            for output in outputs:
+                                if output.get("type") == "message":
+                                    content = output.get("content", [])
+                                    for content_part in content:
+                                        if content_part.get("text"):
+                                            yield gr.ChatMessage(
+                                                role=output.get("role", "assistant"),
+                                                content=content_part.get("text", ""),
+                                                metadata=content_part.get("metadata", {})
+                                            )
+                                elif output.get("type") == "reasoning":
+                                    # if it's openAI, we only have access to a summary of the reasoning
+                                    summary_content = output.get("summary", [])
+                                    if summary_content:
+                                        for summary_part in summary_content:
+                                            if summary_part.get("type") == "summary_text":
+                                                yield gr.ChatMessage(
+                                                    role="assistant",
+                                                    content=summary_part.get("text", "")
+                                                )
+                                    else:
+                                        summary_content = output.get("text", "")
+                                        if summary_content:
+                                            yield gr.ChatMessage(
+                                                role="assistant",
+                                                content=summary_content,
+                                            )
+                                elif output.get("type") == "computer_call":
+                                    action = output.get("action", {})
+                                    action_type = action.get("type", "")
+                                    if action_type:
+                                        action_title = f"🛠️ Performing {action_type}"
+                                        if action.get("x") and action.get("y"):
+                                            action_title += f" at ({action['x']}, {action['y']})"
+                                        yield gr.ChatMessage(
+                                            role="assistant",
+                                            content=f"```json\n{json.dumps(action)}\n```",
+                                            metadata={"title": action_title}
+                                        )
+
+                    for message in generate_gradio_messages():
+                        history.append(message)
+                        yield history
+
             except Exception as e:
                 import traceback

--- cua_agent-0.1.30/pyproject.toml
+++ cua_agent-0.1.32/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-agent"
-version = "0.1.30"
+version = "0.1.32"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -108,7 +108,7 @@ target-version = [

 [tool.ruff]
 line-length = 100
-target-version = "0.1.30"
+target-version = "0.1.32"
 select = [
     "E",
     "F",
@@ -122,7 +122,7 @@ docstring-code-format = true

 [tool.mypy]
 strict = true
-python_version = "0.1.30"
+python_version = "0.1.32"
 ignore_missing_imports = true
 disallow_untyped_defs = true
 check_untyped_defs = true