PyPI - cua-agent - Versions diffs - 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl - Mend

cua-agent 0.4.14 (py3-none-any.whl) → 0.7.16 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (82)

agent/__init__.py +4 -19
agent/__main__.py +2 -1
agent/adapters/__init__.py +6 -0
agent/adapters/azure_ml_adapter.py +283 -0
agent/adapters/cua_adapter.py +161 -0
agent/adapters/huggingfacelocal_adapter.py +67 -125
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +370 -0
agent/adapters/models/__init__.py +41 -0
agent/adapters/models/generic.py +78 -0
agent/adapters/models/internvl.py +290 -0
agent/adapters/models/opencua.py +115 -0
agent/adapters/models/qwen2_5_vl.py +78 -0
agent/agent.py +431 -241
agent/callbacks/__init__.py +10 -3
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +54 -98
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +140 -0
agent/callbacks/otel.py +291 -0
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/prompt_instructions.py +47 -0
agent/callbacks/telemetry.py +106 -69
agent/callbacks/trajectory_saver.py +178 -70
agent/cli.py +269 -119
agent/computers/__init__.py +14 -9
agent/computers/base.py +32 -19
agent/computers/cua.py +52 -25
agent/computers/custom.py +78 -71
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +359 -235
agent/integrations/hud/__init__.py +164 -74
agent/integrations/hud/agent.py +338 -342
agent/integrations/hud/proxy.py +297 -0
agent/loops/__init__.py +44 -14
agent/loops/anthropic.py +590 -492
agent/loops/base.py +19 -15
agent/loops/composed_grounded.py +142 -144
agent/loops/fara/__init__.py +8 -0
agent/loops/fara/config.py +506 -0
agent/loops/fara/helpers.py +357 -0
agent/loops/fara/schema.py +143 -0
agent/loops/gelato.py +183 -0
agent/loops/gemini.py +935 -0
agent/loops/generic_vlm.py +601 -0
agent/loops/glm45v.py +140 -135
agent/loops/gta1.py +48 -51
agent/loops/holo.py +218 -0
agent/loops/internvl.py +180 -0
agent/loops/moondream3.py +493 -0
agent/loops/omniparser.py +326 -226
agent/loops/openai.py +63 -56
agent/loops/opencua.py +134 -0
agent/loops/uiins.py +175 -0
agent/loops/uitars.py +262 -212
agent/loops/uitars2.py +951 -0
agent/playground/__init__.py +5 -0
agent/playground/server.py +301 -0
agent/proxy/examples.py +196 -0
agent/proxy/handlers.py +255 -0
agent/responses.py +486 -339
agent/tools/__init__.py +24 -0
agent/tools/base.py +253 -0
agent/tools/browser_tool.py +423 -0
agent/types.py +20 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +25 -22
agent/ui/gradio/ui_components.py +314 -167
cua_agent-0.7.16.dist-info/METADATA +85 -0
cua_agent-0.7.16.dist-info/RECORD +79 -0
{cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
agent/integrations/hud/adapter.py +0 -121
agent/integrations/hud/computer_handler.py +0 -187
agent/telemetry.py +0 -142
cua_agent-0.4.14.dist-info/METADATA +0 -436
cua_agent-0.4.14.dist-info/RECORD +0 -50
{cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0

agent/responses.py CHANGED Viewed

@@ -6,10 +6,10 @@ Based on the OpenAI spec for Responses API items.
 import base64
 import json
 import uuid
-from typing import List, Dict, Any, Literal, Union, Optional
+from typing import Any, Dict, List, Literal, Optional, Union
+from openai.types.responses.easy_input_message_param import EasyInputMessageParam
 from openai.types.responses.response_computer_tool_call_param import (
-    ResponseComputerToolCallParam,
     ActionClick,
     ActionDoubleClick,
     ActionDrag,
@@ -18,224 +18,222 @@ from openai.types.responses.response_computer_tool_call_param import (
     ActionMove,
     ActionScreenshot,
     ActionScroll,
+)
+from openai.types.responses.response_computer_tool_call_param import (
     ActionType as ActionTypeAction,
+)
+from openai.types.responses.response_computer_tool_call_param import (
     ActionWait,
-    PendingSafetyCheck
+    PendingSafetyCheck,
+    ResponseComputerToolCallParam,
+)
+from openai.types.responses.response_function_tool_call_param import (
+    ResponseFunctionToolCallParam,
 )
-from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
-from openai.types.responses.response_output_text_param import ResponseOutputTextParam
-from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
-from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
-from openai.types.responses.easy_input_message_param import EasyInputMessageParam
 from openai.types.responses.response_input_image_param import ResponseInputImageParam
+from openai.types.responses.response_output_message_param import (
+    ResponseOutputMessageParam,
+)
+from openai.types.responses.response_output_text_param import ResponseOutputTextParam
+from openai.types.responses.response_reasoning_item_param import (
+    ResponseReasoningItemParam,
+    Summary,
+)
 def random_id():
     return str(uuid.uuid4())
 # User message items
 def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
     return EasyInputMessageParam(
         content=[
             ResponseInputImageParam(
                 type="input_image",
-                image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
-            ) # type: ignore
+                image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}",
+            )  # type: ignore
         ],
         role="user",
-        type="message"
+        type="message",
     )
 # Text items
 def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
     return ResponseReasoningItemParam(
-        id=random_id(),
-        summary=[
-            Summary(text=reasoning, type="summary_text")
-        ],
-        type="reasoning"
+        id=random_id(), summary=[Summary(text=reasoning, type="summary_text")], type="reasoning"
     )
 def make_output_text_item(content: str) -> ResponseOutputMessageParam:
     return ResponseOutputMessageParam(
         id=random_id(),
-        content=[
-            ResponseOutputTextParam(
-                text=content,
-                type="output_text",
-                annotations=[]
-            )
-        ],
+        content=[ResponseOutputTextParam(text=content, type="output_text", annotations=[])],
         role="assistant",
         status="completed",
-        type="message"
+        type="message",
     )
 # Function call items
-def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
+def make_function_call_item(
+    function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
+) -> ResponseFunctionToolCallParam:
     return ResponseFunctionToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
         name=function_name,
         arguments=json.dumps(arguments),
         status="completed",
-        type="function_call"
+        type="function_call",
     )
 # Computer tool call items
-def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+def make_click_item(
+    x: int,
+    y: int,
+    button: Literal["left", "right", "wheel", "back", "forward"] = "left",
+    call_id: Optional[str] = None,
+) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionClick(
-            button=button,
-            type="click",
-            x=x,
-            y=y
-        ),
+        action=ActionClick(button=button, type="click", x=x, y=y),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
-def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+def make_double_click_item(
+    x: int, y: int, call_id: Optional[str] = None
+) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionDoubleClick(
-            type="double_click",
-            x=x,
-            y=y
-        ),
+        action=ActionDoubleClick(type="double_click", x=x, y=y),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
-def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+def make_drag_item(
+    path: List[Dict[str, int]], call_id: Optional[str] = None
+) -> ResponseComputerToolCallParam:
     drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionDrag(
-            path=drag_path,
-            type="drag"
-        ),
+        action=ActionDrag(path=drag_path, type="drag"),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
-def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+def make_keypress_item(
+    keys: List[str], call_id: Optional[str] = None
+) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionKeypress(
-            keys=keys,
-            type="keypress"
-        ),
+        action=ActionKeypress(keys=keys, type="keypress"),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
 def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionMove(
-            type="move",
-            x=x,
-            y=y
-        ),
+        action=ActionMove(type="move", x=x, y=y),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
 def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionScreenshot(
-            type="screenshot"
-        ),
+        action=ActionScreenshot(type="screenshot"),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
-def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
+def make_scroll_item(
+    x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
+) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionScroll(
-            scroll_x=scroll_x,
-            scroll_y=scroll_y,
-            type="scroll",
-            x=x,
-            y=y
-        ),
+        action=ActionScroll(scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
 def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionTypeAction(
-            text=text,
-            type="type"
-        ),
+        action=ActionTypeAction(text=text, type="type"),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
 def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
     return ResponseComputerToolCallParam(
         id=random_id(),
         call_id=call_id if call_id else random_id(),
-        action=ActionWait(
-            type="wait"
-        ),
+        action=ActionWait(type="wait"),
         pending_safety_checks=[],
         status="completed",
-        type="computer_call"
+        type="computer_call",
     )
 # Extra anthropic computer calls
-def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
+def make_left_mouse_down_item(
+    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
+) -> Dict[str, Any]:
     return {
         "id": random_id(),
         "call_id": call_id if call_id else random_id(),
-        "action": {
-            "type": "left_mouse_down",
-            "x": x,
-            "y": y
-        },
+        "action": {"type": "left_mouse_down", "x": x, "y": y},
         "pending_safety_checks": [],
         "status": "completed",
-        "type": "computer_call"
+        "type": "computer_call",
     }
-def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
+def make_left_mouse_up_item(
+    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
+) -> Dict[str, Any]:
     return {
         "id": random_id(),
         "call_id": call_id if call_id else random_id(),
-        "action": {
-            "type": "left_mouse_up",
-            "x": x,
-            "y": y
-        },
+        "action": {"type": "left_mouse_up", "x": x, "y": y},
         "pending_safety_checks": [],
         "status": "completed",
-        "type": "computer_call"
+        "type": "computer_call",
     }
-def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]:
+def make_failed_tool_call_items(
+    tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     call_id = call_id if call_id else random_id()
     return [
         {
@@ -249,27 +247,80 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
             "type": "function_call_output",
             "call_id": call_id,
             "output": json.dumps({"error": error_message}),
-        }
+        },
     ]
+def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
+    call_id = call_id if call_id else random_id()
+    return {
+        "type": "function_call_output",
+        "call_id": call_id,
+        "output": json.dumps({"error": error_message}),
+    }
+def replace_failed_computer_calls_with_function_calls(
+    messages: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """
+    Replace computer_call items with function_call items if they share a call_id with a function_call_output.
+    This indicates the computer call failed and should be treated as a function call instead.
+    We do this because the computer_call_output items do not support text output.
+    Args:
+        messages: List of message items to process
+    """
+    messages = messages.copy()
+    # Find all call_ids that have function_call_output items
+    failed_call_ids = set()
+    for msg in messages:
+        if msg.get("type") == "function_call_output":
+            call_id = msg.get("call_id")
+            if call_id:
+                failed_call_ids.add(call_id)
+    # Replace computer_call items that have matching call_ids
+    for i, msg in enumerate(messages):
+        if msg.get("type") == "computer_call" and msg.get("call_id") in failed_call_ids:
+            # Extract action from computer_call
+            action = msg.get("action", {})
+            call_id = msg.get("call_id")
+            # Create function_call replacement
+            messages[i] = {
+                "type": "function_call",
+                "id": msg.get("id", random_id()),
+                "call_id": call_id,
+                "name": "computer",
+                "arguments": json.dumps(action),
+            }
+    return messages
 # Conversion functions between element descriptions and coordinates
-def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
+def convert_computer_calls_desc2xy(
+    responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
+) -> List[Dict[str, Any]]:
     """
     Convert computer calls from element descriptions to x,y coordinates.
     Args:
         responses_items: List of response items containing computer calls with element_description
         desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
     Returns:
         List of response items with element_description replaced by x,y coordinates
     """
     converted_items = []
     for item in responses_items:
         if item.get("type") == "computer_call" and "action" in item:
             action = item["action"].copy()
             # Handle single element_description
             if "element_description" in action:
                 desc = action["element_description"]
@@ -278,48 +329,50 @@ def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2x
                     action["x"] = x
                     action["y"] = y
                     del action["element_description"]
             # Handle start_element_description and end_element_description for drag operations
             elif "start_element_description" in action and "end_element_description" in action:
                 start_desc = action["start_element_description"]
                 end_desc = action["end_element_description"]
                 if start_desc in desc2xy and end_desc in desc2xy:
                     start_x, start_y = desc2xy[start_desc]
                     end_x, end_y = desc2xy[end_desc]
                     action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                     del action["start_element_description"]
                     del action["end_element_description"]
             converted_item = item.copy()
             converted_item["action"] = action
             converted_items.append(converted_item)
         else:
             converted_items.append(item)
     return converted_items
-def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
+def convert_computer_calls_xy2desc(
+    responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
+) -> List[Dict[str, Any]]:
     """
     Convert computer calls from x,y coordinates to element descriptions.
     Args:
         responses_items: List of response items containing computer calls with x,y coordinates
         desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
     Returns:
         List of response items with x,y coordinates replaced by element_description
     """
     # Create reverse mapping from coordinates to descriptions
     xy2desc = {coords: desc for desc, coords in desc2xy.items()}
     converted_items = []
     for item in responses_items:
         if item.get("type") == "computer_call" and "action" in item:
             action = item["action"].copy()
             # Handle single x,y coordinates
             if "x" in action and "y" in action:
                 coords = (action["x"], action["y"])
@@ -327,77 +380,94 @@ def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2x
                     action["element_description"] = xy2desc[coords]
                     del action["x"]
                     del action["y"]
             # Handle path for drag operations
             elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
                 start_point = action["path"][0]
                 end_point = action["path"][1]
-                if ("x" in start_point and "y" in start_point and
-                    "x" in end_point and "y" in end_point):
+                if (
+                    "x" in start_point
+                    and "y" in start_point
+                    and "x" in end_point
+                    and "y" in end_point
+                ):
                     start_coords = (start_point["x"], start_point["y"])
                     end_coords = (end_point["x"], end_point["y"])
                     if start_coords in xy2desc and end_coords in xy2desc:
                         action["start_element_description"] = xy2desc[start_coords]
                         action["end_element_description"] = xy2desc[end_coords]
                         del action["path"]
             converted_item = item.copy()
             converted_item["action"] = action
             converted_items.append(converted_item)
         else:
             converted_items.append(item)
     return converted_items
 def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
     """
     Extract all element descriptions from computer calls in responses items.
     Args:
         responses_items: List of response items containing computer calls
     Returns:
         List of unique element descriptions found in computer calls
     """
     descriptions = set()
     for item in responses_items:
         if item.get("type") == "computer_call" and "action" in item:
             action = item["action"]
             # Handle single element_description
             if "element_description" in action:
                 descriptions.add(action["element_description"])
             # Handle start_element_description and end_element_description for drag operations
             if "start_element_description" in action:
                 descriptions.add(action["start_element_description"])
             if "end_element_description" in action:
                 descriptions.add(action["end_element_description"])
     return list(descriptions)
 # Conversion functions between responses_items and completion messages formats
-def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]], allow_images_in_tool_results: bool = True) -> List[Dict[str, Any]]:
+def convert_responses_items_to_completion_messages(
+    messages: List[Dict[str, Any]],
+    allow_images_in_tool_results: bool = True,
+    send_multiple_user_images_per_parallel_tool_results: bool = False,
+    use_xml_tools: bool = False,
+) -> List[Dict[str, Any]]:
     """Convert responses_items message format to liteLLM completion format.
     Args:
         messages: List of responses_items format messages
         allow_images_in_tool_results: If True, include images in tool role messages.
                                     If False, send tool message + separate user message with image.
+        send_multiple_user_images_per_parallel_tool_results: If True, send multiple user images in parallel tool results.
+        use_xml_tools: If True, use XML-style <tool_call> tags instead of tool_calls array.
+                      Also sends tool results as user messages instead of tool role.
     """
+    # Assert that allow_images_in_tool_results is False when use_xml_tools is True
+    if use_xml_tools:
+        assert (
+            not allow_images_in_tool_results
+        ), "allow_images_in_tool_results must be False when use_xml_tools is True"
     completion_messages = []
-    for message in messages:
+    for i, message in enumerate(messages):
         msg_type = message.get("type")
         role = message.get("role")
         # Handle user messages (both with and without explicit type)
         if role == "user" or msg_type == "user":
             content = message.get("content", "")
@@ -406,34 +476,19 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
                 completion_content = []
                 for item in content:
                     if item.get("type") == "input_image":
-                        completion_content.append({
-                            "type": "image_url",
-                            "image_url": {
-                                "url": item.get("image_url")
-                            }
-                        })
+                        completion_content.append(
+                            {"type": "image_url", "image_url": {"url": item.get("image_url")}}
+                        )
                     elif item.get("type") == "input_text":
-                        completion_content.append({
-                            "type": "text",
-                            "text": item.get("text")
-                        })
+                        completion_content.append({"type": "text", "text": item.get("text")})
                     elif item.get("type") == "text":
-                        completion_content.append({
-                            "type": "text",
-                            "text": item.get("text")
-                        })
-                completion_messages.append({
-                    "role": "user",
-                    "content": completion_content
-                })
+                        completion_content.append({"type": "text", "text": item.get("text")})
+                completion_messages.append({"role": "user", "content": completion_content})
             elif isinstance(content, str):
                 # Handle string content
-                completion_messages.append({
-                    "role": "user",
-                    "content": content
-                })
+                completion_messages.append({"role": "user", "content": content})
         # Handle assistant messages
         elif role == "assistant" or msg_type == "message":
             content = message.get("content", [])
@@ -444,13 +499,12 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
                         text_parts.append(item.get("text", ""))
                     elif item.get("type") == "text":
                         text_parts.append(item.get("text", ""))
                 if text_parts:
-                    completion_messages.append({
-                        "role": "assistant",
-                        "content": "\n".join(text_parts)
-                    })
+                    completion_messages.append(
+                        {"role": "assistant", "content": "\n".join(text_parts)}
+                    )
         # Handle reasoning items (convert to assistant message)
         elif msg_type == "reasoning":
             summary = message.get("summary", [])
@@ -458,107 +512,185 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
             for item in summary:
                 if item.get("type") == "summary_text":
                     text_parts.append(item.get("text", ""))
             if text_parts:
-                completion_messages.append({
-                    "role": "assistant",
-                    "content": "\n".join(text_parts)
-                })
+                completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})
         # Handle function calls
         elif msg_type == "function_call":
-            # Add tool call to last assistant message or create new one
-            if not completion_messages or completion_messages[-1]["role"] != "assistant":
-                completion_messages.append({
-                    "role": "assistant",
-                    "content": "",
-                    "tool_calls": []
-                })
-            if "tool_calls" not in completion_messages[-1]:
-                completion_messages[-1]["tool_calls"] = []
-            completion_messages[-1]["tool_calls"].append({
-                "id": message.get("call_id"),
-                "type": "function",
-                "function": {
-                    "name": message.get("name"),
-                    "arguments": message.get("arguments")
-                }
-            })
+            if use_xml_tools:
+                # Use XML format instead of tool_calls array
+                if not completion_messages or completion_messages[-1]["role"] != "assistant":
+                    completion_messages.append({"role": "assistant", "content": ""})
+                # Ensure arguments is a JSON string (not a dict)
+                arguments = message.get("arguments")
+                if isinstance(arguments, dict):
+                    arguments = json.dumps(arguments)
+                # Format as XML tool call
+                tool_call_xml = f'<tool_call>{{"name": "{message.get("name")}", "arguments": {arguments}}}</tool_call>'
+                if completion_messages[-1]["content"]:
+                    completion_messages[-1]["content"] += "\n" + tool_call_xml
+                else:
+                    completion_messages[-1]["content"] = tool_call_xml
+            else:
+                # Add tool call to last assistant message or create new one
+                if not completion_messages or completion_messages[-1]["role"] != "assistant":
+                    completion_messages.append(
+                        {"role": "assistant", "content": "", "tool_calls": []}
+                    )
+                if "tool_calls" not in completion_messages[-1]:
+                    completion_messages[-1]["tool_calls"] = []
+                # Ensure arguments is a JSON string (not a dict)
+                arguments = message.get("arguments")
+                if isinstance(arguments, dict):
+                    arguments = json.dumps(arguments)
+                completion_messages[-1]["tool_calls"].append(
+                    {
+                        "id": message.get("call_id"),
+                        "type": "function",
+                        "function": {
+                            "name": message.get("name"),
+                            "arguments": arguments,
+                        },
+                    }
+                )
         # Handle computer calls
         elif msg_type == "computer_call":
-            # Add tool call to last assistant message or create new one
-            if not completion_messages or completion_messages[-1]["role"] != "assistant":
-                completion_messages.append({
-                    "role": "assistant",
-                    "content": "",
-                    "tool_calls": []
-                })
-            if "tool_calls" not in completion_messages[-1]:
-                completion_messages[-1]["tool_calls"] = []
-            action = message.get("action", {})
-            completion_messages[-1]["tool_calls"].append({
-                "id": message.get("call_id"),
-                "type": "function",
-                "function": {
-                    "name": "computer",
-                    "arguments": json.dumps(action)
-                }
-            })
+            if use_xml_tools:
+                # Use XML format instead of tool_calls array
+                if not completion_messages or completion_messages[-1]["role"] != "assistant":
+                    completion_messages.append({"role": "assistant", "content": ""})
+                action = message.get("action", {})
+                # Format as XML tool call
+                tool_call_xml = f'<tool_call>{{"name": "computer", "arguments": {json.dumps(action)}}}</tool_call>'
+                if completion_messages[-1]["content"]:
+                    completion_messages[-1]["content"] += "\n" + tool_call_xml
+                else:
+                    completion_messages[-1]["content"] = tool_call_xml
+            else:
+                # Add tool call to last assistant message or create new one
+                if not completion_messages or completion_messages[-1]["role"] != "assistant":
+                    completion_messages.append(
+                        {"role": "assistant", "content": "", "tool_calls": []}
+                    )
+                if "tool_calls" not in completion_messages[-1]:
+                    completion_messages[-1]["tool_calls"] = []
+                action = message.get("action", {})
+                completion_messages[-1]["tool_calls"].append(
+                    {
+                        "id": message.get("call_id"),
+                        "type": "function",
+                        "function": {"name": "computer", "arguments": json.dumps(action)},
+                    }
+                )
         # Handle function/computer call outputs
         elif msg_type in ["function_call_output", "computer_call_output"]:
             output = message.get("output")
             call_id = message.get("call_id")
-            if isinstance(output, dict) and output.get("type") == "input_image":
-                if allow_images_in_tool_results:
-                    # Handle image output as tool response (may not work with all APIs)
-                    completion_messages.append({
-                        "role": "tool",
-                        "tool_call_id": call_id,
-                        "content": [{
-                            "type": "image_url",
-                            "image_url": {
-                                "url": output.get("image_url")
-                            }
-                        }]
-                    })
+            if use_xml_tools:
+                # When using XML tools, send all results as user messages
+                if isinstance(output, dict) and output.get("type") == "input_image":
+                    # Send image as user message
+                    completion_messages.append(
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": output.get("image_url")},
+                                }
+                            ],
+                        }
+                    )
                 else:
-                    # Send tool message + separate user message with image (OpenAI compatible)
-                    completion_messages += [{
-                        "role": "tool",
-                        "tool_call_id": call_id,
-                        "content": "[Execution completed. See screenshot below]"
-                    }, {
-                        "role": "user",
-                        "content": [{
-                            "type": "image_url",
-                            "image_url": {
-                                "url": output.get("image_url")
-                            }
-                        }]
-                    }]
+                    # Send text result as user message
+                    completion_messages.append(
+                        {
+                            "role": "user",
+                            "content": str(output),
+                        }
+                    )
             else:
-                # Handle text output as tool response
-                completion_messages.append({
-                    "role": "tool",
-                    "tool_call_id": call_id,
-                    "content": str(output)
-                })
+                # Standard tool message handling
+                if isinstance(output, dict) and output.get("type") == "input_image":
+                    if allow_images_in_tool_results:
+                        # Handle image output as tool response (may not work with all APIs)
+                        completion_messages.append(
+                            {
+                                "role": "tool",
+                                "tool_call_id": call_id,
+                                "content": [
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": output.get("image_url")},
+                                    }
+                                ],
+                            }
+                        )
+                    else:
+                        # Determine if the next message is also a tool call output
+                        next_type = None
+                        if i + 1 < len(messages):
+                            next_msg = messages[i + 1]
+                            next_type = next_msg.get("type")
+                        is_next_message_image_result = next_type in [
+                            "computer_call_output",
+                        ]
+                        # Send tool message + separate user message with image (OpenAI compatible)
+                        completion_messages += (
+                            [
+                                {
+                                    "role": "tool",
+                                    "tool_call_id": call_id,
+                                    "content": "[Execution completed. See screenshot below]",
+                                },
+                                {
+                                    "role": "user",
+                                    "content": [
+                                        {
+                                            "type": "image_url",
+                                            "image_url": {"url": output.get("image_url")},
+                                        }
+                                    ],
+                                },
+                            ]
+                            if send_multiple_user_images_per_parallel_tool_results
+                            or (not is_next_message_image_result)
+                            else [
+                                {
+                                    "role": "tool",
+                                    "tool_call_id": call_id,
+                                    "content": "[Execution completed. See screenshot below]",
+                                },
+                            ]
+                        )
+                else:
+                    # Handle text output as tool response
+                    completion_messages.append(
+                        {"role": "tool", "tool_call_id": call_id, "content": str(output)}
+                    )
     return completion_messages
-def convert_completion_messages_to_responses_items(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+def convert_completion_messages_to_responses_items(
+    completion_messages: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
     """Convert completion messages format to responses_items message format."""
     responses_items = []
     skip_next = False
     for i, message in enumerate(completion_messages):
         if skip_next:
             skip_next = False
@@ -567,25 +699,24 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
         role = message.get("role")
         content = message.get("content")
         tool_calls = message.get("tool_calls", [])
         # Handle assistant messages with text content
         if role == "assistant" and content and isinstance(content, str):
-            responses_items.append({
-                "type": "message",
-                "role": "assistant",
-                "content": [{
-                    "type": "output_text",
-                    "text": content
-                }]
-            })
+            responses_items.append(
+                {
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [{"type": "output_text", "text": content}],
+                }
+            )
         # Handle tool calls
         if tool_calls:
             for tool_call in tool_calls:
                 if tool_call.get("type") == "function":
                     function = tool_call.get("function", {})
                     function_name = function.get("name")
                     if function_name == "computer":
                         # Parse computer action
                         try:
@@ -594,31 +725,37 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
                             if action.get("action"):
                                 action["type"] = action["action"]
                                 del action["action"]
-                            responses_items.append({
-                                "type": "computer_call",
-                                "call_id": tool_call.get("id"),
-                                "action": action,
-                                "status": "completed"
-                            })
+                            responses_items.append(
+                                {
+                                    "type": "computer_call",
+                                    "call_id": tool_call.get("id"),
+                                    "action": action,
+                                    "status": "completed",
+                                }
+                            )
                         except json.JSONDecodeError:
                             # Fallback to function call format
-                            responses_items.append({
+                            responses_items.append(
+                                {
+                                    "type": "function_call",
+                                    "call_id": tool_call.get("id"),
+                                    "name": function_name,
+                                    "arguments": function.get("arguments", "{}"),
+                                    "status": "completed",
+                                }
+                            )
+                    else:
+                        # Regular function call
+                        responses_items.append(
+                            {
                                 "type": "function_call",
                                 "call_id": tool_call.get("id"),
                                 "name": function_name,
                                 "arguments": function.get("arguments", "{}"),
-                                "status": "completed"
-                            })
-                    else:
-                        # Regular function call
-                        responses_items.append({
-                            "type": "function_call",
-                            "call_id": tool_call.get("id"),
-                            "name": function_name,
-                            "arguments": function.get("arguments", "{}"),
-                            "status": "completed"
-                        })
+                                "status": "completed",
+                            }
+                        )
         # Handle tool messages (function/computer call outputs)
         elif role == "tool" and content:
             tool_call_id = message.get("tool_call_id")
@@ -627,74 +764,90 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
                 if content == "[Execution completed. See screenshot below]":
                     # Look ahead for the next user message with image
                     next_idx = i + 1
-                    if (next_idx < len(completion_messages) and
-                        completion_messages[next_idx].get("role") == "user" and
-                        isinstance(completion_messages[next_idx].get("content"), list)):
+                    if (
+                        next_idx < len(completion_messages)
+                        and completion_messages[next_idx].get("role") == "user"
+                        and isinstance(completion_messages[next_idx].get("content"), list)
+                    ):
                         # Found the pattern - extract image from next message
                         next_content = completion_messages[next_idx]["content"]
                         for item in next_content:
                             if item.get("type") == "image_url":
-                                responses_items.append({
-                                    "type": "computer_call_output",
-                                    "call_id": tool_call_id,
-                                    "output": {
-                                        "type": "input_image",
-                                        "image_url": item.get("image_url", {}).get("url")
+                                responses_items.append(
+                                    {
+                                        "type": "computer_call_output",
+                                        "call_id": tool_call_id,
+                                        "output": {
+                                            "type": "input_image",
+                                            "image_url": item.get("image_url", {}).get("url"),
+                                        },
                                     }
-                                })
+                                )
                                 # Skip the next user message since we processed it
                                 skip_next = True
                                 break
                     else:
                         # No matching user message, treat as regular text
-                        responses_items.append({
-                            "type": "computer_call_output",
-                            "call_id": tool_call_id,
-                            "output": content
-                        })
+                        responses_items.append(
+                            {
+                                "type": "computer_call_output",
+                                "call_id": tool_call_id,
+                                "output": content,
+                            }
+                        )
                 else:
                     # Determine if this is a computer call or function call output
                     try:
                         # Try to parse as structured output
                         parsed_content = json.loads(content)
                         if parsed_content.get("type") == "input_image":
-                            responses_items.append({
-                                "type": "computer_call_output",
-                                "call_id": tool_call_id,
-                                "output": parsed_content
-                            })
+                            responses_items.append(
+                                {
+                                    "type": "computer_call_output",
+                                    "call_id": tool_call_id,
+                                    "output": parsed_content,
+                                }
+                            )
                         else:
-                            responses_items.append({
-                                "type": "computer_call_output",
-                                "call_id": tool_call_id,
-                                "output": content
-                            })
+                            responses_items.append(
+                                {
+                                    "type": "computer_call_output",
+                                    "call_id": tool_call_id,
+                                    "output": content,
+                                }
+                            )
                     except json.JSONDecodeError:
                         # Plain text output - could be function or computer call
-                        responses_items.append({
-                            "type": "function_call_output",
-                            "call_id": tool_call_id,
-                            "output": content
-                        })
+                        responses_items.append(
+                            {
+                                "type": "function_call_output",
+                                "call_id": tool_call_id,
+                                "output": content,
+                            }
+                        )
             elif isinstance(content, list):
                 # Handle structured content (e.g., images)
                 for item in content:
                     if item.get("type") == "image_url":
-                        responses_items.append({
-                            "type": "computer_call_output",
-                            "call_id": tool_call_id,
-                            "output": {
-                                "type": "input_image",
-                                "image_url": item.get("image_url", {}).get("url")
+                        responses_items.append(
+                            {
+                                "type": "computer_call_output",
+                                "call_id": tool_call_id,
+                                "output": {
+                                    "type": "input_image",
+                                    "image_url": item.get("image_url", {}).get("url"),
+                                },
                             }
-                        })
+                        )
                     elif item.get("type") == "text":
-                        responses_items.append({
-                            "type": "function_call_output",
-                            "call_id": tool_call_id,
-                            "output": item.get("text")
-                        })
+                        responses_items.append(
+                            {
+                                "type": "function_call_output",
+                                "call_id": tool_call_id,
+                                "output": item.get("text"),
+                            }
+                        )
         # Handle actual user messages
         elif role == "user" and content:
             if isinstance(content, list):
@@ -702,27 +855,21 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
                 user_content = []
                 for item in content:
                     if item.get("type") == "image_url":
-                        user_content.append({
-                            "type": "input_image",
-                            "image_url": item.get("image_url", {}).get("url")
-                        })
+                        user_content.append(
+                            {
+                                "type": "input_image",
+                                "image_url": item.get("image_url", {}).get("url"),
+                            }
+                        )
                     elif item.get("type") == "text":
-                        user_content.append({
-                            "type": "input_text",
-                            "text": item.get("text")
-                        })
+                        user_content.append({"type": "input_text", "text": item.get("text")})
                 if user_content:
-                    responses_items.append({
-                        "role": "user",
-                        "type": "message",
-                        "content": user_content
-                    })
+                    responses_items.append(
+                        {"role": "user", "type": "message", "content": user_content}
+                    )
             elif isinstance(content, str):
                 # Handle simple text user message
-                responses_items.append({
-                    "role": "user",
-                    "content": content
-                })
+                responses_items.append({"role": "user", "content": content})
     return responses_items

cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl