PyPI - cua-agent - Versions diffs - 0.4.36__tar.gz → 0.4.37__tar.gz - Mend

cua-agent 0.4.36.tar.gz → 0.4.37.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (65) hide show

{cua_agent-0.4.36 → cua_agent-0.4.37}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.36
+Version: 0.4.37
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12

{cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/omniparser.py RENAMED Viewed

@@ -14,67 +14,73 @@ import litellm
 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
+from ..responses import (
+    convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
+)
 from ..types import AgentCapability, AgentResponse, Messages, Tools
 SOM_TOOL_SCHEMA = {
     "type": "function",
-    "name": "computer",
-    "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
-    "parameters": {
-        "type": "object",
-        "properties": {
-            "action": {
-                "type": "string",
-                "enum": [
-                    "screenshot",
-                    "click",
-                    "double_click",
-                    "drag",
-                    "type",
-                    "keypress",
-                    "scroll",
-                    "move",
-                    "wait",
-                    "get_current_url",
-                    "get_dimensions",
-                    "get_environment",
-                ],
-                "description": "The action to perform",
-            },
-            "element_id": {
-                "type": "integer",
-                "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
-            },
-            "start_element_id": {
-                "type": "integer",
-                "description": "The ID of the element to start dragging from (required for drag action)",
-            },
-            "end_element_id": {
-                "type": "integer",
-                "description": "The ID of the element to drag to (required for drag action)",
-            },
-            "text": {
-                "type": "string",
-                "description": "The text to type (required for type action)",
-            },
-            "keys": {
-                "type": "string",
-                "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
-            },
-            "button": {
-                "type": "string",
-                "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
-            },
-            "scroll_x": {
-                "type": "integer",
-                "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
-            },
-            "scroll_y": {
-                "type": "integer",
-                "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform",
+                },
+                "element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
+                },
+                "start_element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to start dragging from (required for drag action)",
+                },
+                "end_element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "string",
+                    "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
+                },
+                "button": {
+                    "type": "string",
+                    "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+                },
             },
+            "required": ["action", "element_id"],
         },
-        "required": ["action"],
     },
 }
@@ -256,7 +262,7 @@ async def replace_computer_call_with_function(
             {
                 "type": "function_call_output",
                 "call_id": item.get("call_id"),
-                "output": output,
+                "output": item.get("output"),
                 "id": item.get("id"),
                 "status": "completed",
             }
@@ -298,6 +304,13 @@ class OmniparserConfig(AsyncAgentConfig):
         llm_model = model.split("+")[-1]
+        # Get screen dimensions from computer handler
+        try:
+            width, height = await computer_handler.get_dimensions()
+        except Exception:
+            # Fallback to default dimensions if method fails
+            width, height = 1024, 768
         # Prepare tools for OpenAI API
         openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
@@ -311,27 +324,43 @@ class OmniparserConfig(AsyncAgentConfig):
                 result = parser.parse(image_data)
                 if _on_screenshot:
                     await _on_screenshot(result.annotated_image_base64, "annotated_image")
+                # Convert OmniParser normalized coordinates (0-1) to absolute pixels, convert to pixels
                 for element in result.elements:
-                    id2xy[element.id] = (
-                        (element.bbox.x1 + element.bbox.x2) / 2,
-                        (element.bbox.y1 + element.bbox.y2) / 2,
-                    )
-        # handle computer calls -> function calls
-        new_messages = []
-        for message in messages:
+                    norm_x = (element.bbox.x1 + element.bbox.x2) / 2
+                    norm_y = (element.bbox.y1 + element.bbox.y2) / 2
+                    pixel_x = int(norm_x * width)
+                    pixel_y = int(norm_y * height)
+                    id2xy[element.id] = (pixel_x, pixel_y)
+                # Replace the original screenshot with the annotated image
+                annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
+                last_computer_call_output["output"]["image_url"] = annotated_image_url
+        xy2id = {v: k for k, v in id2xy.items()}
+        messages_with_element_ids = []
+        for i, message in enumerate(messages):
             if not isinstance(message, dict):
                 message = message.__dict__
-            new_messages += await replace_computer_call_with_function(message, id2xy)  # type: ignore
-        messages = new_messages
+            msg_type = message.get("type")
+            if msg_type == "computer_call" and "action" in message:
+                action = message.get("action", {})
+            converted = await replace_computer_call_with_function(message, xy2id)  # type: ignore
+            messages_with_element_ids += converted
+        completion_messages = convert_responses_items_to_completion_messages(
+            messages_with_element_ids, allow_images_in_tool_results=False
+        )
         # Prepare API call kwargs
         api_kwargs = {
             "model": llm_model,
-            "input": messages,
+            "messages": completion_messages,
             "tools": openai_tools if openai_tools else None,
             "stream": stream,
-            "truncation": "auto",
             "num_retries": max_retries,
             **kwargs,
         }
@@ -342,8 +371,8 @@ class OmniparserConfig(AsyncAgentConfig):
         print(str(api_kwargs)[:1000])
-        # Use liteLLM responses
-        response = await litellm.aresponses(**api_kwargs)
+        # Use liteLLM completion
+        response = await litellm.acompletion(**api_kwargs)
         # Call API end hook
         if _on_api_end:
@@ -357,12 +386,45 @@ class OmniparserConfig(AsyncAgentConfig):
         if _on_usage:
             await _on_usage(usage)
-        # handle som function calls -> xy computer calls
-        new_output = []
-        for i in range(len(response.output)):  # type: ignore
-            new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)  # type: ignore
+        response_dict = response.model_dump()  # type: ignore
+        choice_messages = [choice["message"] for choice in response_dict["choices"]]
+        responses_items = []
+        for choice_message in choice_messages:
+            responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
+        # Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
+        final_output = []
+        for item in responses_items:
+            if item.get("type") == "computer_call" and "action" in item:
+                action = item["action"].copy()
+                # Handle single element_id
+                if "element_id" in action:
+                    element_id = action["element_id"]
+                    if element_id in id2xy:
+                        x, y = id2xy[element_id]
+                        action["x"] = x
+                        action["y"] = y
+                        del action["element_id"]
+                # Handle start_element_id and end_element_id for drag operations
+                elif "start_element_id" in action and "end_element_id" in action:
+                    start_id = action["start_element_id"]
+                    end_id = action["end_element_id"]
+                    if start_id in id2xy and end_id in id2xy:
+                        start_x, start_y = id2xy[start_id]
+                        end_x, end_y = id2xy[end_id]
+                        action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
+                        del action["start_element_id"]
+                        del action["end_element_id"]
+                converted_item = item.copy()
+                converted_item["action"] = action
+                final_output.append(converted_item)
+            else:
+                final_output.append(item)
-        return {"output": new_output, "usage": usage}
+        return {"output": final_output, "usage": usage}
     async def predict_click(
         self, model: str, image_b64: str, instruction: str, **kwargs

{cua_agent-0.4.36 → cua_agent-0.4.37}/pyproject.toml RENAMED Viewed

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 [project]
 name = "cua-agent"
-version = "0.4.36"
+version = "0.4.37"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [