PyPI - cua-agent - Versions diffs - 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl - Mend

cua-agent: 0.4.22-py3-none-any.whl → 0.7.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (79)

agent/__init__.py +4 -10
agent/__main__.py +2 -1
agent/adapters/__init__.py +4 -0
agent/adapters/azure_ml_adapter.py +283 -0
agent/adapters/cua_adapter.py +161 -0
agent/adapters/huggingfacelocal_adapter.py +67 -125
agent/adapters/human_adapter.py +116 -114
agent/adapters/mlxvlm_adapter.py +110 -99
agent/adapters/models/__init__.py +41 -0
agent/adapters/models/generic.py +78 -0
agent/adapters/models/internvl.py +290 -0
agent/adapters/models/opencua.py +115 -0
agent/adapters/models/qwen2_5_vl.py +78 -0
agent/agent.py +337 -185
agent/callbacks/__init__.py +9 -4
agent/callbacks/base.py +45 -31
agent/callbacks/budget_manager.py +22 -10
agent/callbacks/image_retention.py +54 -98
agent/callbacks/logging.py +55 -42
agent/callbacks/operator_validator.py +35 -33
agent/callbacks/otel.py +291 -0
agent/callbacks/pii_anonymization.py +19 -16
agent/callbacks/prompt_instructions.py +47 -0
agent/callbacks/telemetry.py +99 -61
agent/callbacks/trajectory_saver.py +95 -69
agent/cli.py +269 -119
agent/computers/__init__.py +14 -9
agent/computers/base.py +32 -19
agent/computers/cua.py +52 -25
agent/computers/custom.py +78 -71
agent/decorators.py +23 -14
agent/human_tool/__init__.py +2 -7
agent/human_tool/__main__.py +6 -2
agent/human_tool/server.py +48 -37
agent/human_tool/ui.py +359 -235
agent/integrations/hud/__init__.py +38 -99
agent/integrations/hud/agent.py +369 -0
agent/integrations/hud/proxy.py +166 -52
agent/loops/__init__.py +44 -14
agent/loops/anthropic.py +579 -492
agent/loops/base.py +19 -15
agent/loops/composed_grounded.py +136 -150
agent/loops/fara/__init__.py +8 -0
agent/loops/fara/config.py +506 -0
agent/loops/fara/helpers.py +357 -0
agent/loops/fara/schema.py +143 -0
agent/loops/gelato.py +183 -0
agent/loops/gemini.py +935 -0
agent/loops/generic_vlm.py +601 -0
agent/loops/glm45v.py +140 -135
agent/loops/gta1.py +48 -51
agent/loops/holo.py +218 -0
agent/loops/internvl.py +180 -0
agent/loops/moondream3.py +493 -0
agent/loops/omniparser.py +326 -226
agent/loops/openai.py +50 -51
agent/loops/opencua.py +134 -0
agent/loops/uiins.py +175 -0
agent/loops/uitars.py +247 -206
agent/loops/uitars2.py +951 -0
agent/playground/__init__.py +5 -0
agent/playground/server.py +301 -0
agent/proxy/examples.py +61 -57
agent/proxy/handlers.py +46 -39
agent/responses.py +447 -347
agent/tools/__init__.py +24 -0
agent/tools/base.py +253 -0
agent/tools/browser_tool.py +423 -0
agent/types.py +11 -5
agent/ui/__init__.py +1 -1
agent/ui/__main__.py +1 -1
agent/ui/gradio/app.py +25 -22
agent/ui/gradio/ui_components.py +314 -167
cua_agent-0.7.16.dist-info/METADATA +85 -0
cua_agent-0.7.16.dist-info/RECORD +79 -0
{cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
cua_agent-0.4.22.dist-info/METADATA +0 -436
cua_agent-0.4.22.dist-info/RECORD +0 -51
{cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0

agent/loops/anthropic.py (changed)

@@ -4,69 +4,68 @@ Anthropic hosted tools agent loop implementation using liteLLM
 import asyncio
 import json
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 import litellm
-from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    make_reasoning_item,
-    make_output_text_item,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_failed_tool_call_items,
+    make_input_image_item,
     make_keypress_item,
+    make_left_mouse_down_item,
+    make_left_mouse_up_item,
     make_move_item,
+    make_output_text_item,
+    make_reasoning_item,
+    make_screenshot_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item,
-    make_screenshot_item,
-    make_failed_tool_call_items,
-    make_left_mouse_down_item,
-    make_left_mouse_up_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 # Model version mapping to tool version and beta flag
 MODEL_TOOL_MAPPING = [
     # Claude 4 models
     {
-        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
+        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
         "tool_version": "computer_20250124",
-        "beta_flag": "computer-use-2025-01-24"
+        "beta_flag": "computer-use-2025-01-24",
     },
     # Claude 3.7 models
     {
         "pattern": r"claude-3\.?7|claude-3-7",
         "tool_version": "computer_20250124",
-        "beta_flag": "computer-use-2025-01-24"
+        "beta_flag": "computer-use-2025-01-24",
     },
     # Claude 3.5 models (fallback)
     {
         "pattern": r"claude-3\.?5|claude-3-5",
         "tool_version": "computer_20241022",
-        "beta_flag": "computer-use-2024-10-22"
-    }
+        "beta_flag": "computer-use-2024-10-22",
+    },
 ]
 def _get_tool_config_for_model(model: str) -> Dict[str, str]:
     """Get tool version and beta flag for the given model."""
     import re
     for mapping in MODEL_TOOL_MAPPING:
         if re.search(mapping["pattern"], model, re.IGNORECASE):
-            return {
-                "tool_version": mapping["tool_version"],
-                "beta_flag": mapping["beta_flag"]
-            }
+            return {"tool_version": mapping["tool_version"], "beta_flag": mapping["beta_flag"]}
     # Default to Claude 3.5 configuration
-    return {
-        "tool_version": "computer_20241022",
-        "beta_flag": "computer-use-2024-10-22"
-    }
+    return {"tool_version": "computer_20241022", "beta_flag": "computer-use-2024-10-22"}
 async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
     """Map a computer tool to Anthropic's hosted tool schema."""
@@ -76,7 +75,7 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
     return {
         "type": tool_version,
         "function": {
@@ -89,32 +88,34 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
         },
     }
 async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
     """Prepare tools for Anthropic API format."""
     tool_config = _get_tool_config_for_model(model)
     anthropic_tools = []
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to Anthropic format
-            anthropic_tools.append(await _map_computer_tool_to_anthropic(
-                schema["computer"],
-                tool_config["tool_version"]
-            ))
+            anthropic_tools.append(
+                await _map_computer_tool_to_anthropic(
+                    schema["computer"], tool_config["tool_version"]
+                )
+            )
         elif schema["type"] == "function":
             # Function tools - convert to Anthropic format
             function_schema = schema["function"]
-            anthropic_tools.append({
-                "type": "function",
-                "function": {
+            anthropic_tools.append(
+                {
                     "name": function_schema["name"],
                     "description": function_schema.get("description", ""),
-                    "parameters": function_schema.get("parameters", {})
+                    "input_schema": function_schema.get("parameters", {}),
                 }
-            })
+            )
     return anthropic_tools
 def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
     """Convert responses_items message format to liteLLM completion format."""
     completion_messages = []
@@ -123,7 +124,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
     for message in messages:
         msg_type = message.get("type")
         role = message.get("role")
         # Handle user messages (both with and without explicit type)
         if role == "user" or msg_type == "user":
             content = message.get("content", "")
@@ -132,55 +133,41 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 converted_content = []
                 for item in content:
                     if isinstance(item, dict) and item.get("type") == "input_image":
-                        # Convert input_image to Anthropic image format
+                        # Convert input_image to OpenAI image format
                         image_url = item.get("image_url", "")
                         if image_url and image_url != "[omitted]":
-                            # Extract base64 data from data URL
-                            if "," in image_url:
-                                base64_data = image_url.split(",")[-1]
-                            else:
-                                base64_data = image_url
-                            converted_content.append({
-                                "type": "image",
-                                "source": {
-                                    "type": "base64",
-                                    "media_type": "image/png",
-                                    "data": base64_data
-                                }
-                            })
+                            converted_content.append(
+                                {"type": "image_url", "image_url": {"url": image_url}}
+                            )
+                    elif isinstance(item, dict) and item.get("type") == "input_text":
+                        # Convert input_text to OpenAI text format
+                        text = item.get("text", "")
+                        converted_content.append({"type": "text", "text": text})
                     else:
                         # Keep other content types as-is
                         converted_content.append(item)
-                completion_messages.append({
-                    "role": "user",
-                    "content": converted_content if converted_content else content
-                })
+                completion_messages.append(
+                    {"role": "user", "content": converted_content if converted_content else content}
+                )
             else:
                 # Text content
-                completion_messages.append({
-                    "role": "user",
-                    "content": content
-                })
+                completion_messages.append({"role": "user", "content": content})
         # Handle assistant messages
         elif role == "assistant":
             content = message.get("content", [])
             if isinstance(content, str):
-                content = [{ "type": "output_text", "text": content }]
+                content = [{"type": "output_text", "text": content}]
             content = "\n".join(item.get("text", "") for item in content)
-            completion_messages.append({
-                "role": "assistant",
-                "content": content
-            })
+            completion_messages.append({"role": "assistant", "content": content})
         elif msg_type == "reasoning":
             # Reasoning becomes part of assistant message
             summary = message.get("summary", [])
             reasoning_text = ""
             if isinstance(summary, list) and summary:
                 # Extract text from summary items
                 for item in summary:
@@ -190,58 +177,54 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
             else:
                 # Fallback to direct reasoning field
                 reasoning_text = message.get("reasoning", "")
             if reasoning_text:
-                completion_messages.append({
-                    "role": "assistant",
-                    "content": reasoning_text
-                })
+                completion_messages.append({"role": "assistant", "content": reasoning_text})
         elif msg_type == "function_call":
             fn_name = message.get("name")
             fn_args = message.get("arguments", "{}")
             call_id = message.get("call_id", "call_1")
             call_id_to_fn_name[call_id] = fn_name
-            openai_tool_calls = [{
-                "id": call_id,
-                "type": "function",
-                "function": {
-                    "name": fn_name,
-                    "arguments": fn_args
+            openai_tool_calls = [
+                {
+                    "id": call_id,
+                    "type": "function",
+                    "function": {"name": fn_name, "arguments": fn_args},
                 }
-            }]            # If the last completion message is an assistant message, extend the tool_calls
+            ]  # If the last completion message is an assistant message, extend the tool_calls
             if completion_messages and completion_messages[-1].get("role") == "assistant":
                 if "tool_calls" not in completion_messages[-1]:
                     completion_messages[-1]["tool_calls"] = []
                 completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
             else:
                 # Create new assistant message with tool calls
-                completion_messages.append({
-                    "role": "assistant",
-                    "content": None,
-                    "tool_calls": openai_tool_calls
-                })
+                completion_messages.append(
+                    {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
+                )
         elif msg_type == "function_call_output":
             call_id = message.get("call_id", "call_1")
             fn_output = message.get("output", "")
             fn_name = call_id_to_fn_name.get(call_id, "computer")
-            completion_messages.append({
-                "role": "function",
-                "name": fn_name,
-                "tool_call_id": call_id,
-                "content": str(fn_output)
-            })
+            completion_messages.append(
+                {
+                    "role": "function",
+                    "name": fn_name,
+                    "tool_call_id": call_id,
+                    "content": str(fn_output),
+                }
+            )
         elif msg_type == "computer_call":
             # Computer call becomes tool use in assistant message
             action = message.get("action", {})
             action_type = action.get("type")
             call_id = message.get("call_id", "call_1")
             tool_use_content = []
             # Basic actions (all versions)
             if action_type == "click":
                 # Input:
@@ -254,7 +237,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "y": 200
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -268,16 +251,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #     "type": "function"
                 # }
                 button = action.get("button", "left")
-                action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click"
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": action_name,
-                        "coordinate": [action.get("x", 0), action.get("y", 0)]
+                action_name = (
+                    "right_click"
+                    if button == "right"
+                    else "middle_click" if button == "wheel" else "left_click"
+                )
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": action_name,
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                        },
                     }
-                })
+                )
             elif action_type == "double_click":
                 # Input:
                 # {
@@ -289,7 +278,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "y": 240
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -302,15 +291,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #     "id": "call_1",
                 #     "type": "function"
                 # }
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "double_click",
-                        "coordinate": [action.get("x", 0), action.get("y", 0)]
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "double_click",
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                        },
                     }
-                })
+                )
             elif action_type == "type":
                 # Input:
                 # {
@@ -321,7 +312,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "text": "Hello World"
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -334,15 +325,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #     "id": "call_1",
                 #     "type": "function"
                 # }
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "type",
-                        "text": action.get("text", "")
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "type", "text": action.get("text", "")},
                     }
-                })
+                )
             elif action_type == "keypress":
                 # Input:
                 # {
@@ -353,7 +343,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "keys": ["ctrl", "c"]
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -366,15 +356,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #     "id": "call_1",
                 #     "type": "function"
                 # }
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "key",
-                        "text": "+".join(action.get("keys", []))
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "key", "text": "+".join(action.get("keys", []))},
                     }
-                })
+                )
             elif action_type in ["mouse_move", "move"]:
                 # Input:
                 # {
@@ -386,7 +375,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "y": 250
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -399,15 +388,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #     "id": "call_1",
                 #     "type": "function"
                 # }
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "mouse_move",
-                        "coordinate": [action.get("x", 0), action.get("y", 0)]
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "mouse_move",
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                        },
                     }
-                })
+                )
             elif action_type == "scroll":
                 # Input:
                 # {
@@ -421,7 +412,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "scroll_y": -5
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -454,18 +445,20 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 else:
                     direction = "down"
                     amount = 3
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "scroll",
-                        "coordinate": [action.get("x", 0), action.get("y", 0)],
-                        "scroll_direction": direction,
-                        "scroll_amount": amount
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "scroll",
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                            "scroll_direction": direction,
+                            "scroll_amount": amount,
+                        },
                     }
-                })
+                )
             elif action_type == "drag":
                 # Input:
                 # {
@@ -479,7 +472,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         ]
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -499,17 +492,19 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 if isinstance(path, list) and len(path) >= 2:
                     start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
                     end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "left_click_drag",
-                        "start_coordinate": start_coord,
-                        "end_coordinate": end_coord
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "left_click_drag",
+                            "start_coordinate": start_coord,
+                            "end_coordinate": end_coord,
+                        },
                     }
-                })
+                )
             elif action_type == "wait":
                 # Input:
                 # {
@@ -519,7 +514,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "type": "wait"
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -531,14 +526,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #     "id": "call_1",
                 #     "type": "function"
                 # }
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "wait"
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "wait"},
                     }
-                })
+                )
             elif action_type == "screenshot":
                 # Input:
                 # {
@@ -548,7 +543,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #         "type": "screenshot"
                 #     }
                 # }
                 # Output:
                 # {
                 #     "function": {
@@ -560,47 +555,53 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 #     "id": "call_1",
                 #     "type": "function"
                 # }
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "screenshot"
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "screenshot"},
                     }
-                })
+                )
             elif action_type == "left_mouse_down":
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "left_mouse_down",
-                        "coordinate": [action.get("x", None), action.get("y", None)]
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "left_mouse_down",
+                            "coordinate": [action.get("x", None), action.get("y", None)],
+                        },
                     }
-                })
+                )
             elif action_type == "left_mouse_up":
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "left_mouse_up",
-                        "coordinate": [action.get("x", None), action.get("y", None)]
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "left_mouse_up",
+                            "coordinate": [action.get("x", None), action.get("y", None)],
+                        },
                     }
-                })
+                )
             # Convert tool_use_content to OpenAI tool_calls format
             openai_tool_calls = []
             for tool_use in tool_use_content:
-                openai_tool_calls.append({
-                    "id": tool_use["id"],
-                    "type": "function",
-                    "function": {
-                        "name": tool_use["name"],
-                        "arguments": json.dumps(tool_use["input"])
+                openai_tool_calls.append(
+                    {
+                        "id": tool_use["id"],
+                        "type": "function",
+                        "function": {
+                            "name": tool_use["name"],
+                            "arguments": json.dumps(tool_use["input"]),
+                        },
                     }
-                })
+                )
             # If the last completion message is an assistant message, extend the tool_calls
             if completion_messages and completion_messages[-1].get("role") == "assistant":
                 if "tool_calls" not in completion_messages[-1]:
@@ -608,54 +609,52 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
                 completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
             else:
                 # Create new assistant message with tool calls
-                completion_messages.append({
-                    "role": "assistant",
-                    "content": None,
-                    "tool_calls": openai_tool_calls
-                })
+                completion_messages.append(
+                    {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
+                )
         elif msg_type == "computer_call_output":
             # Computer call output becomes OpenAI function result
             output = message.get("output", {})
             call_id = message.get("call_id", "call_1")
             if output.get("type") == "input_image":
                 # Screenshot result - convert to OpenAI format with image_url content
                 image_url = output.get("image_url", "")
-                completion_messages.append({
-                    "role": "function",
-                    "name": "computer",
-                    "tool_call_id": call_id,
-                    "content": [{
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }]
-                })
+                completion_messages.append(
+                    {
+                        "role": "function",
+                        "name": "computer",
+                        "tool_call_id": call_id,
+                        "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                    }
+                )
             else:
                 # Text result - convert to OpenAI format
-                completion_messages.append({
-                    "role": "function",
-                    "name": "computer",
-                    "tool_call_id": call_id,
-                    "content": str(output)
-                })
+                completion_messages.append(
+                    {
+                        "role": "function",
+                        "name": "computer",
+                        "tool_call_id": call_id,
+                        "content": str(output),
+                    }
+                )
     return completion_messages
 def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
     """Convert liteLLM completion response to responses_items message format."""
     responses_items = []
-    if not response or not hasattr(response, 'choices') or not response.choices:
+    if not response or not hasattr(response, "choices") or not response.choices:
         return responses_items
     choice = response.choices[0]
     message = choice.message
     # Handle text content
-    if hasattr(message, 'content') and message.content:
+    if hasattr(message, "content") and message.content:
         if isinstance(message.content, str):
             responses_items.append(make_output_text_item(message.content))
         elif isinstance(message.content, list):
@@ -664,35 +663,54 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                     if content_item.get("type") == "text":
                         responses_items.append(make_output_text_item(content_item.get("text", "")))
                     elif content_item.get("type") == "tool_use":
-                        # Convert tool use to computer call
+                        # Check if this is a custom function tool or computer tool
+                        tool_name = content_item.get("name", "computer")
                         tool_input = content_item.get("input", {})
-                        action_type = tool_input.get("action")
                         call_id = content_item.get("id")
+                        # Handle custom function tools (not computer tools)
+                        if tool_name != "computer":
+                            from ..responses import make_function_call_item
+                            responses_items.append(
+                                make_function_call_item(
+                                    function_name=tool_name, arguments=tool_input, call_id=call_id
+                                )
+                            )
+                            continue
+                        # Computer tool - process actions
+                        action_type = tool_input.get("action")
                         # Action reference:
                         # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions
                         try:
                             # Basic actions (all versions)
                             if action_type == "screenshot":
                                 responses_items.append(make_screenshot_item(call_id=call_id))
                             elif action_type in ["click", "left_click"]:
                                 coordinate = tool_input.get("coordinate", [0, 0])
-                                responses_items.append(make_click_item(
-                                    x=coordinate[0] if len(coordinate) > 0 else 0,
-                                    y=coordinate[1] if len(coordinate) > 1 else 0,
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_click_item(
+                                        x=coordinate[0] if len(coordinate) > 0 else 0,
+                                        y=coordinate[1] if len(coordinate) > 1 else 0,
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type in ["type", "type_text"]:
-                                responses_items.append(make_type_item(
-                                    text=tool_input.get("text", ""),
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_type_item(text=tool_input.get("text", ""), call_id=call_id)
+                                )
                             elif action_type in ["key", "keypress", "hotkey"]:
-                                responses_items.append(make_keypress_item(
-                                    keys=tool_input.get("text", "").replace("+", "-").split("-"),
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_keypress_item(
+                                        keys=tool_input.get("text", "")
+                                        .replace("+", "-")
+                                        .split("-"),
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type in ["mouse_move", "move_cursor", "move"]:
                                 # Mouse move - create a custom action item
                                 coordinate = tool_input.get("coordinate", [0, 0])
@@ -700,64 +718,88 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                                     make_move_item(
                                         x=coordinate[0] if len(coordinate) > 0 else 0,
                                         y=coordinate[1] if len(coordinate) > 1 else 0,
-                                        call_id=call_id
+                                        call_id=call_id,
                                     )
                                 )
                             # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
                             elif action_type == "scroll":
                                 coordinate = tool_input.get("coordinate", [0, 0])
                                 scroll_amount = tool_input.get("scroll_amount", 3)
-                                scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
-                                    -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
-                                scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
-                                    -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
-                                responses_items.append(make_scroll_item(
-                                    x=coordinate[0] if len(coordinate) > 0 else 0,
-                                    y=coordinate[1] if len(coordinate) > 1 else 0,
-                                    scroll_x=scroll_x,
-                                    scroll_y=scroll_y,
-                                    call_id=call_id
-                                ))
+                                scroll_x = (
+                                    scroll_amount
+                                    if tool_input.get("scroll_direction", "down") == "right"
+                                    else (
+                                        -scroll_amount
+                                        if tool_input.get("scroll_direction", "down") == "left"
+                                        else 0
+                                    )
+                                )
+                                scroll_y = (
+                                    scroll_amount
+                                    if tool_input.get("scroll_direction", "down") == "down"
+                                    else (
+                                        -scroll_amount
+                                        if tool_input.get("scroll_direction", "down") == "up"
+                                        else 0
+                                    )
+                                )
+                                responses_items.append(
+                                    make_scroll_item(
+                                        x=coordinate[0] if len(coordinate) > 0 else 0,
+                                        y=coordinate[1] if len(coordinate) > 1 else 0,
+                                        scroll_x=scroll_x,
+                                        scroll_y=scroll_y,
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type in ["left_click_drag", "drag"]:
                                 start_coord = tool_input.get("start_coordinate", [0, 0])
                                 end_coord = tool_input.get("end_coordinate", [0, 0])
-                                responses_items.append(make_drag_item(
-                                    path=[
-                                        {
-                                            "x": start_coord[0] if len(start_coord) > 0 else 0,
-                                            "y": start_coord[1] if len(start_coord) > 1 else 0
-                                        },
-                                        {
-                                            "x": end_coord[0] if len(end_coord) > 0 else 0,
-                                            "y": end_coord[1] if len(end_coord) > 1 else 0
-                                        }
-                                    ],
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_drag_item(
+                                        path=[
+                                            {
+                                                "x": start_coord[0] if len(start_coord) > 0 else 0,
+                                                "y": start_coord[1] if len(start_coord) > 1 else 0,
+                                            },
+                                            {
+                                                "x": end_coord[0] if len(end_coord) > 0 else 0,
+                                                "y": end_coord[1] if len(end_coord) > 1 else 0,
+                                            },
+                                        ],
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type == "right_click":
                                 coordinate = tool_input.get("coordinate", [0, 0])
-                                responses_items.append(make_click_item(
-                                    x=coordinate[0] if len(coordinate) > 0 else 0,
-                                    y=coordinate[1] if len(coordinate) > 1 else 0,
-                                    button="right",
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_click_item(
+                                        x=coordinate[0] if len(coordinate) > 0 else 0,
+                                        y=coordinate[1] if len(coordinate) > 1 else 0,
+                                        button="right",
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type == "middle_click":
                                 coordinate = tool_input.get("coordinate", [0, 0])
-                                responses_items.append(make_click_item(
-                                    x=coordinate[0] if len(coordinate) > 0 else 0,
-                                    y=coordinate[1] if len(coordinate) > 1 else 0,
-                                    button="wheel",
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_click_item(
+                                        x=coordinate[0] if len(coordinate) > 0 else 0,
+                                        y=coordinate[1] if len(coordinate) > 1 else 0,
+                                        button="wheel",
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type == "double_click":
                                 coordinate = tool_input.get("coordinate", [0, 0])
-                                responses_items.append(make_double_click_item(
-                                    x=coordinate[0] if len(coordinate) > 0 else 0,
-                                    y=coordinate[1] if len(coordinate) > 1 else 0,
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_double_click_item(
+                                        x=coordinate[0] if len(coordinate) > 0 else 0,
+                                        y=coordinate[1] if len(coordinate) > 1 else 0,
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type == "triple_click":
                                 # coordinate = tool_input.get("coordinate", [0, 0])
                                 # responses_items.append({
@@ -783,11 +825,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                                 #     }
                                 # })
                                 coordinate = tool_input.get("coordinate", [None, None])
-                                responses_items.append(make_left_mouse_down_item(
-                                    x=coordinate[0] if len(coordinate) > 0 else None,
-                                    y=coordinate[1] if len(coordinate) > 1 else None,
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_left_mouse_down_item(
+                                        x=coordinate[0] if len(coordinate) > 0 else None,
+                                        y=coordinate[1] if len(coordinate) > 1 else None,
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type == "left_mouse_up":
                                 # coordinate = tool_input.get("coordinate", [0, 0])
                                 # responses_items.append({
@@ -801,11 +845,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                                 #     }
                                 # })
                                 coordinate = tool_input.get("coordinate", [None, None])
-                                responses_items.append(make_left_mouse_up_item(
-                                    x=coordinate[0] if len(coordinate) > 0 else None,
-                                    y=coordinate[1] if len(coordinate) > 1 else None,
-                                    call_id=call_id
-                                ))
+                                responses_items.append(
+                                    make_left_mouse_up_item(
+                                        x=coordinate[0] if len(coordinate) > 0 else None,
+                                        y=coordinate[1] if len(coordinate) > 1 else None,
+                                        call_id=call_id,
+                                    )
+                                )
                             elif action_type == "hold_key":
                                 # responses_items.append({
                                 #     "type": "computer_call",
@@ -817,22 +863,41 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                                 # })
                                 raise NotImplementedError("hold_key")
                             elif action_type == "wait":
-                                responses_items.append(make_wait_item(
-                                    call_id=call_id
-                                ))
+                                responses_items.append(make_wait_item(call_id=call_id))
                             else:
                                 raise ValueError(f"Unknown action type: {action_type}")
                         except Exception as e:
-                            responses_items.extend(make_failed_tool_call_items(
-                                tool_name="computer",
-                                tool_kwargs=tool_input,
-                                error_message=repr(e),
-                                call_id=call_id
-                            ))
+                            responses_items.extend(
+                                make_failed_tool_call_items(
+                                    tool_name="computer",
+                                    tool_kwargs=tool_input,
+                                    error_message=repr(e),
+                                    call_id=call_id,
+                                )
+                            )
     # Handle tool calls (alternative format)
-    if hasattr(message, 'tool_calls') and message.tool_calls:
+    if hasattr(message, "tool_calls") and message.tool_calls:
         for tool_call in message.tool_calls:
+            tool_name = tool_call.function.name
+            # Handle custom function tools
+            if tool_name != "computer":
+                from ..responses import make_function_call_item
+                # tool_call.function.arguments is a JSON string, need to parse it
+                try:
+                    args_dict = json.loads(tool_call.function.arguments)
+                except json.JSONDecodeError:
+                    args_dict = {}
+                responses_items.append(
+                    make_function_call_item(
+                        function_name=tool_name, arguments=args_dict, call_id=tool_call.id
+                    )
+                )
+                continue
+            # Handle computer tool
             if tool_call.function.name == "computer":
                 try:
                     try:
@@ -853,7 +918,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -862,9 +927,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #         "type": "screenshot"
                             #     }
                             # }
-                            responses_items.append(make_screenshot_item(
-                                call_id=call_id
-                            ))
+                            responses_items.append(make_screenshot_item(call_id=call_id))
                         elif action_type in ["click", "left_click"]:
                             # Input:
                             # {
@@ -878,7 +941,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -890,11 +953,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     }
                             # }
                             coordinate = args.get("coordinate", [0, 0])
-                            responses_items.append(make_click_item(
-                                x=coordinate[0] if len(coordinate) > 0 else 0,
-                                y=coordinate[1] if len(coordinate) > 1 else 0,
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_click_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else 0,
+                                    y=coordinate[1] if len(coordinate) > 1 else 0,
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type in ["type", "type_text"]:
                             # Input:
                             # {
@@ -908,7 +973,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -918,10 +983,9 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #         "text": "Hello World"
                             #     }
                             # }
-                            responses_items.append(make_type_item(
-                                text=args.get("text", ""),
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_type_item(text=args.get("text", ""), call_id=call_id)
+                            )
                         elif action_type in ["key", "keypress", "hotkey"]:
                             # Input:
                             # {
@@ -935,7 +999,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -945,10 +1009,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #         "keys": ["ctrl", "c"]
                             #     }
                             # }
-                            responses_items.append(make_keypress_item(
-                                keys=args.get("text", "").replace("+", "-").split("-"),
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_keypress_item(
+                                    keys=args.get("text", "").replace("+", "-").split("-"),
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type in ["mouse_move", "move_cursor", "move"]:
                             # Input:
                             # {
@@ -962,7 +1028,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -974,12 +1040,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     }
                             # }
                             coordinate = args.get("coordinate", [0, 0])
-                            responses_items.append(make_move_item(
-                                x=coordinate[0] if len(coordinate) > 0 else 0,
-                                y=coordinate[1] if len(coordinate) > 1 else 0,
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_move_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else 0,
+                                    y=coordinate[1] if len(coordinate) > 1 else 0,
+                                    call_id=call_id,
+                                )
+                            )
                         # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
                         elif action_type == "scroll":
                             # Input:
@@ -996,7 +1064,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1012,17 +1080,25 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             coordinate = args.get("coordinate", [0, 0])
                             direction = args.get("scroll_direction", "down")
                             amount = args.get("scroll_amount", 3)
-                            scroll_x = amount if direction == "left" else \
-                                    -amount if direction == "right" else 0
-                            scroll_y = amount if direction == "up" else \
-                                    -amount if direction == "down" else 0
-                            responses_items.append(make_scroll_item(
-                                x=coordinate[0] if len(coordinate) > 0 else 0,
-                                y=coordinate[1] if len(coordinate) > 1 else 0,
-                                scroll_x=scroll_x,
-                                scroll_y=scroll_y,
-                                call_id=call_id
-                            ))
+                            scroll_x = (
+                                amount
+                                if direction == "left"
+                                else -amount if direction == "right" else 0
+                            )
+                            scroll_y = (
+                                amount
+                                if direction == "up"
+                                else -amount if direction == "down" else 0
+                            )
+                            responses_items.append(
+                                make_scroll_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else 0,
+                                    y=coordinate[1] if len(coordinate) > 1 else 0,
+                                    scroll_x=scroll_x,
+                                    scroll_y=scroll_y,
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type in ["left_click_drag", "drag"]:
                             # Input:
                             # {
@@ -1037,7 +1113,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1052,19 +1128,21 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             # }
                             start_coord = args.get("start_coordinate", [0, 0])
                             end_coord = args.get("end_coordinate", [0, 0])
-                            responses_items.append(make_drag_item(
-                                path=[
-                                    {
-                                        "x": start_coord[0] if len(start_coord) > 0 else 0,
-                                        "y": start_coord[1] if len(start_coord) > 1 else 0
-                                    },
-                                    {
-                                        "x": end_coord[0] if len(end_coord) > 0 else 0,
-                                        "y": end_coord[1] if len(end_coord) > 1 else 0
-                                    }
-                                ],
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_drag_item(
+                                    path=[
+                                        {
+                                            "x": start_coord[0] if len(start_coord) > 0 else 0,
+                                            "y": start_coord[1] if len(start_coord) > 1 else 0,
+                                        },
+                                        {
+                                            "x": end_coord[0] if len(end_coord) > 0 else 0,
+                                            "y": end_coord[1] if len(end_coord) > 1 else 0,
+                                        },
+                                    ],
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type == "right_click":
                             # Input:
                             # {
@@ -1078,7 +1156,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1091,12 +1169,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     }
                             # }
                             coordinate = args.get("coordinate", [0, 0])
-                            responses_items.append(make_click_item(
-                                x=coordinate[0] if len(coordinate) > 0 else 0,
-                                y=coordinate[1] if len(coordinate) > 1 else 0,
-                                button="right",
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_click_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else 0,
+                                    y=coordinate[1] if len(coordinate) > 1 else 0,
+                                    button="right",
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type == "middle_click":
                             # Input:
                             # {
@@ -1110,7 +1190,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1123,12 +1203,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     }
                             # }
                             coordinate = args.get("coordinate", [0, 0])
-                            responses_items.append(make_click_item(
-                                x=coordinate[0] if len(coordinate) > 0 else 0,
-                                y=coordinate[1] if len(coordinate) > 1 else 0,
-                                button="wheel",
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_click_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else 0,
+                                    y=coordinate[1] if len(coordinate) > 1 else 0,
+                                    button="wheel",
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type == "double_click":
                             # Input:
                             # {
@@ -1142,7 +1224,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1154,11 +1236,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     }
                             # }
                             coordinate = args.get("coordinate", [0, 0])
-                            responses_items.append(make_double_click_item(
-                                x=coordinate[0] if len(coordinate) > 0 else 0,
-                                y=coordinate[1] if len(coordinate) > 1 else 0,
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_double_click_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else 0,
+                                    y=coordinate[1] if len(coordinate) > 1 else 0,
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type == "triple_click":
                             # Input:
                             # {
@@ -1172,7 +1256,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1197,7 +1281,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1210,11 +1294,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     }
                             # }
                             coordinate = args.get("coordinate", [None, None])
-                            responses_items.append(make_left_mouse_down_item(
-                                x=coordinate[0] if len(coordinate) > 0 else None,
-                                y=coordinate[1] if len(coordinate) > 1 else None,
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_left_mouse_down_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else None,
+                                    y=coordinate[1] if len(coordinate) > 1 else None,
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type == "left_mouse_up":
                             # Input:
                             # {
@@ -1228,7 +1314,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1241,11 +1327,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     }
                             # }
                             coordinate = args.get("coordinate", [None, None])
-                            responses_items.append(make_left_mouse_up_item(
-                                x=coordinate[0] if len(coordinate) > 0 else None,
-                                y=coordinate[1] if len(coordinate) > 1 else None,
-                                call_id=call_id
-                            ))
+                            responses_items.append(
+                                make_left_mouse_up_item(
+                                    x=coordinate[0] if len(coordinate) > 0 else None,
+                                    y=coordinate[1] if len(coordinate) > 1 else None,
+                                    call_id=call_id,
+                                )
+                            )
                         elif action_type == "hold_key":
                             # Input:
                             # {
@@ -1259,7 +1347,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1282,7 +1370,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #     "id": "call_1",
                             #     "type": "function"
                             # }
                             # Output:
                             # {
                             #     "type": "computer_call",
@@ -1291,74 +1379,77 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             #         "type": "wait"
                             #     }
                             # }
-                            responses_items.append(make_wait_item(
-                                call_id=call_id
-                            ))
+                            responses_items.append(make_wait_item(call_id=call_id))
                     except Exception as e:
-                        responses_items.extend(make_failed_tool_call_items(
-                            tool_name="computer",
-                            tool_kwargs=args,
-                            error_message=repr(e),
-                            call_id=call_id
-                        ))
+                        responses_items.extend(
+                            make_failed_tool_call_items(
+                                tool_name="computer",
+                                tool_kwargs=args,
+                                error_message=repr(e),
+                                call_id=call_id,
+                            )
+                        )
                 except json.JSONDecodeError:
                     print("Failed to decode tool call arguments")
                     # Skip malformed tool calls
                     continue
     return responses_items
 def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Add cache control to completion messages"""
     num_writes = 0
     for message in completion_messages:
-        message["cache_control"] = { "type": "ephemeral" }
+        message["cache_control"] = {"type": "ephemeral"}
         num_writes += 1
         # Cache control has a maximum of 4 blocks
         if num_writes >= 4:
             break
     return completion_messages
 def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Combine completion messages with the same role"""
     if not completion_messages:
         return completion_messages
     combined_messages = []
     for message in completion_messages:
         # If this is the first message or role is different from last, add as new message
         if not combined_messages or combined_messages[-1]["role"] != message["role"]:
             # Ensure content is a list format and normalize text content
             new_message = message.copy()
             new_message["content"] = _normalize_content(message.get("content", ""))
             # Copy tool_calls if present
             if "tool_calls" in message:
                 new_message["tool_calls"] = message["tool_calls"].copy()
             combined_messages.append(new_message)
         else:
             # Same role as previous message, combine them
             last_message = combined_messages[-1]
             # Combine content
             current_content = _normalize_content(message.get("content", ""))
             last_message["content"].extend(current_content)
             # Combine tool_calls if present
             if "tool_calls" in message:
                 if "tool_calls" not in last_message:
                     last_message["tool_calls"] = []
                 last_message["tool_calls"].extend(message["tool_calls"])
     # Post-process to merge consecutive text blocks
     for message in combined_messages:
         message["content"] = _merge_consecutive_text(message["content"])
     return combined_messages
 def _normalize_content(content) -> List[Dict[str, Any]]:
     """Normalize content to list format"""
     if isinstance(content, str):
@@ -1371,28 +1462,28 @@ def _normalize_content(content) -> List[Dict[str, Any]]:
     else:
         return []
 def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Merge consecutive text blocks with newlines"""
     if not content_list:
         return content_list
     merged = []
     for item in content_list:
-        if (item.get("type") == "text" and
-            merged and
-            merged[-1].get("type") == "text"):
+        if item.get("type") == "text" and merged and merged[-1].get("type") == "text":
             # Merge with previous text block
             merged[-1]["text"] += "\n" + item["text"]
         else:
             merged.append(item.copy())
     return merged
 @register_agent(models=r".*claude-.*")
 class AnthropicHostedToolsConfig(AsyncAgentConfig):
     """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
     async def predict_step(
         self,
         messages: Messages,
@@ -1406,21 +1497,21 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Anthropic hosted tools agent loop using liteLLM acompletion.
         Supports Anthropic's computer use models with hosted tools.
         """
         tools = tools or []
         # Get tool configuration for this model
         tool_config = _get_tool_config_for_model(model)
         # Prepare tools for Anthropic API
         anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
         # Convert responses_items messages to completion format
         completion_messages = _convert_responses_items_to_completion_messages(messages)
         if use_prompt_caching:
@@ -1428,7 +1519,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
             completion_messages = _combine_completion_messages(completion_messages)
             # Then add cache control, anthropic requires explicit "cache_control" dicts
             completion_messages = _add_cache_control(completion_messages)
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -1436,80 +1527,74 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
             "tools": anthropic_tools if anthropic_tools else None,
             "stream": stream,
             "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
         # Add beta header for computer use
         if anthropic_tools:
-            api_kwargs["headers"] = {
-                "anthropic-beta": tool_config["beta_flag"]
-            }
+            api_kwargs["headers"] = {"anthropic-beta": tool_config["beta_flag"]}
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
         # Convert response to responses_items format
         responses_items = _convert_completion_to_responses_items(response)
         # Extract usage information
-        responses_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+        responses_usage = {
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(responses_usage)
         # Return in AsyncAgentConfig format
-        return {
-            "output": responses_items,
-            "usage": responses_usage
-        }
+        return {"output": responses_items, "usage": responses_usage}
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
         Uses Anthropic's computer use models with a custom prompt that instructs
         the agent to only output clicks.
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """
         # Get image dimensions from base64 data
         try:
             import base64
-            from PIL import Image
             from io import BytesIO
+            from PIL import Image
             image_data = base64.b64decode(image_b64)
             image = Image.open(BytesIO(image_data))
             display_width, display_height = image.size
         except Exception:
             # Fallback to default dimensions if image parsing fails
             display_width, display_height = 1024, 768
         # Get tool configuration for this model
         tool_config = _get_tool_config_for_model(model)
         # Prepare computer tool for Anthropic format
         computer_tool = {
             "type": tool_config["tool_version"],
@@ -1522,7 +1607,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
                 },
             },
         }
         # Construct messages in OpenAI chat completion format for liteLLM
         messages = [
             {
@@ -1541,18 +1626,16 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
 7. Be decisive and action-oriented. Complete the requested task fully.
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
                     },
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{image_b64}"
-                        }
-                    }
-                ]
+                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                    },
+                ],
             }
         ]
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -1560,32 +1643,36 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "tools": [computer_tool],
             "stream": False,
             "max_tokens": 100,  # Keep response short for click prediction
-            "headers": {
-                "anthropic-beta": tool_config["beta_flag"]
-            }
+            "headers": {"anthropic-beta": tool_config["beta_flag"]},
         }
+        # Thread optional API params
+        if "api_key" in kwargs and kwargs.get("api_key") is not None:
+            api_kwargs["api_key"] = kwargs.get("api_key")
+        if "api_base" in kwargs and kwargs.get("api_base") is not None:
+            api_kwargs["api_base"] = kwargs.get("api_base")
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
         # Convert response to responses_items format to extract click coordinates
         responses_items = _convert_completion_to_responses_items(response)
         # Look for computer_call with click action
         for item in responses_items:
-            if (isinstance(item, dict) and
-                item.get("type") == "computer_call" and
-                isinstance(item.get("action"), dict)):
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
                 action = item["action"]
-                if action.get("type") == "click":
+                if action.get("x") and action.get("y"):
                     x = action.get("x")
                     y = action.get("y")
-                    if x is not None and y is not None:
-                        return (int(x), int(y))
+                    return (int(x), int(y))
         return None
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click", "step"]

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

Potentially problematic release.

cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl