cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/anthropic.py
CHANGED
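Two short sketches follow before the hunks themselves; both are reconstructed from the added ("+") lines in the diff below and are illustrative only, not code shipped in the wheel. The first shows how the updated MODEL_TOOL_MAPPING, which now also matches claude-haiku-4, resolves a model name to a computer-use tool version and beta flag. The top-level function name without the leading underscore and the demo model strings are assumptions made for the example.

import re
from typing import Dict

MODEL_TOOL_MAPPING = [
    # Claude 4 models, including the newly added claude-haiku-4 pattern
    {
        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
        "tool_version": "computer_20250124",
        "beta_flag": "computer-use-2025-01-24",
    },
    # Claude 3.7 models
    {
        "pattern": r"claude-3\.?7|claude-3-7",
        "tool_version": "computer_20250124",
        "beta_flag": "computer-use-2025-01-24",
    },
    # Claude 3.5 models (fallback)
    {
        "pattern": r"claude-3\.?5|claude-3-5",
        "tool_version": "computer_20241022",
        "beta_flag": "computer-use-2024-10-22",
    },
]

def get_tool_config_for_model(model: str) -> Dict[str, str]:
    """Return the computer-use tool version and beta flag for a model name."""
    for mapping in MODEL_TOOL_MAPPING:
        if re.search(mapping["pattern"], model, re.IGNORECASE):
            return {"tool_version": mapping["tool_version"], "beta_flag": mapping["beta_flag"]}
    # Unknown models fall back to the Claude 3.5 configuration
    return {"tool_version": "computer_20241022", "beta_flag": "computer-use-2024-10-22"}

# The example model names are hypothetical inputs, chosen only to exercise both branches.
print(get_tool_config_for_model("claude-haiku-4"))     # computer_20250124 / computer-use-2025-01-24
print(get_tool_config_for_model("claude-3-5-sonnet"))  # computer_20241022 / computer-use-2024-10-22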
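The second sketch shows how the rewritten conversion turns a responses_items "computer_call" click into an OpenAI-style tool call for liteLLM: the button selects the Anthropic action name, the coordinates become a two-element list, and the tool_use input is serialized with json.dumps. The wrapper function here is an assumption for illustration; in the module the same steps happen inline inside _convert_responses_items_to_completion_messages.

import json

def click_to_tool_call(message: dict) -> dict:
    """Illustrative sketch of the computer_call click -> OpenAI tool_call mapping in the diff below."""
    action = message.get("action", {})
    call_id = message.get("call_id", "call_1")
    button = action.get("button", "left")
    action_name = (
        "right_click"
        if button == "right"
        else "middle_click" if button == "wheel" else "left_click"
    )
    # First expressed as an Anthropic tool_use block...
    tool_use = {
        "type": "tool_use",
        "id": call_id,
        "name": "computer",
        "input": {
            "action": action_name,
            "coordinate": [action.get("x", 0), action.get("y", 0)],
        },
    }
    # ...then re-packed as an OpenAI function tool call with JSON-encoded arguments.
    return {
        "id": tool_use["id"],
        "type": "function",
        "function": {"name": tool_use["name"], "arguments": json.dumps(tool_use["input"])},
    }

print(click_to_tool_call(
    {"type": "computer_call", "call_id": "call_1",
     "action": {"type": "click", "button": "right", "x": 100, "y": 200}}
))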
|
@@ -4,69 +4,68 @@ Anthropic hosted tools agent loop implementation using liteLLM
|
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import json
|
|
7
|
-
from typing import
|
|
7
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
|
|
8
|
+
|
|
8
9
|
import litellm
|
|
9
|
-
from litellm.responses.litellm_completion_transformation.transformation import
|
|
10
|
+
from litellm.responses.litellm_completion_transformation.transformation import (
|
|
11
|
+
LiteLLMCompletionResponsesConfig,
|
|
12
|
+
)
|
|
10
13
|
|
|
11
14
|
from ..decorators import register_agent
|
|
12
|
-
from ..types import Messages, AgentResponse, Tools, AgentCapability
|
|
13
15
|
from ..loops.base import AsyncAgentConfig
|
|
14
16
|
from ..responses import (
|
|
15
|
-
make_reasoning_item,
|
|
16
|
-
make_output_text_item,
|
|
17
17
|
make_click_item,
|
|
18
18
|
make_double_click_item,
|
|
19
19
|
make_drag_item,
|
|
20
|
+
make_failed_tool_call_items,
|
|
21
|
+
make_input_image_item,
|
|
20
22
|
make_keypress_item,
|
|
23
|
+
make_left_mouse_down_item,
|
|
24
|
+
make_left_mouse_up_item,
|
|
21
25
|
make_move_item,
|
|
26
|
+
make_output_text_item,
|
|
27
|
+
make_reasoning_item,
|
|
28
|
+
make_screenshot_item,
|
|
22
29
|
make_scroll_item,
|
|
23
30
|
make_type_item,
|
|
24
31
|
make_wait_item,
|
|
25
|
-
make_input_image_item,
|
|
26
|
-
make_screenshot_item,
|
|
27
|
-
make_failed_tool_call_items,
|
|
28
|
-
make_left_mouse_down_item,
|
|
29
|
-
make_left_mouse_up_item
|
|
30
32
|
)
|
|
33
|
+
from ..types import AgentCapability, AgentResponse, Messages, Tools
|
|
31
34
|
|
|
32
35
|
# Model version mapping to tool version and beta flag
|
|
33
36
|
MODEL_TOOL_MAPPING = [
|
|
34
37
|
# Claude 4 models
|
|
35
38
|
{
|
|
36
|
-
"pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
|
|
39
|
+
"pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
|
|
37
40
|
"tool_version": "computer_20250124",
|
|
38
|
-
"beta_flag": "computer-use-2025-01-24"
|
|
41
|
+
"beta_flag": "computer-use-2025-01-24",
|
|
39
42
|
},
|
|
40
43
|
# Claude 3.7 models
|
|
41
44
|
{
|
|
42
45
|
"pattern": r"claude-3\.?7|claude-3-7",
|
|
43
46
|
"tool_version": "computer_20250124",
|
|
44
|
-
"beta_flag": "computer-use-2025-01-24"
|
|
47
|
+
"beta_flag": "computer-use-2025-01-24",
|
|
45
48
|
},
|
|
46
49
|
# Claude 3.5 models (fallback)
|
|
47
50
|
{
|
|
48
51
|
"pattern": r"claude-3\.?5|claude-3-5",
|
|
49
52
|
"tool_version": "computer_20241022",
|
|
50
|
-
"beta_flag": "computer-use-2024-10-22"
|
|
51
|
-
}
|
|
53
|
+
"beta_flag": "computer-use-2024-10-22",
|
|
54
|
+
},
|
|
52
55
|
]
|
|
53
56
|
|
|
57
|
+
|
|
54
58
|
def _get_tool_config_for_model(model: str) -> Dict[str, str]:
|
|
55
59
|
"""Get tool version and beta flag for the given model."""
|
|
56
60
|
import re
|
|
57
|
-
|
|
61
|
+
|
|
58
62
|
for mapping in MODEL_TOOL_MAPPING:
|
|
59
63
|
if re.search(mapping["pattern"], model, re.IGNORECASE):
|
|
60
|
-
return {
|
|
61
|
-
|
|
62
|
-
"beta_flag": mapping["beta_flag"]
|
|
63
|
-
}
|
|
64
|
-
|
|
64
|
+
return {"tool_version": mapping["tool_version"], "beta_flag": mapping["beta_flag"]}
|
|
65
|
+
|
|
65
66
|
# Default to Claude 3.5 configuration
|
|
66
|
-
return {
|
|
67
|
-
|
|
68
|
-
"beta_flag": "computer-use-2024-10-22"
|
|
69
|
-
}
|
|
67
|
+
return {"tool_version": "computer_20241022", "beta_flag": "computer-use-2024-10-22"}
|
|
68
|
+
|
|
70
69
|
|
|
71
70
|
async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
|
|
72
71
|
"""Map a computer tool to Anthropic's hosted tool schema."""
|
|
@@ -76,7 +75,7 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
|
|
|
76
75
|
except Exception:
|
|
77
76
|
# Fallback to default dimensions if method fails
|
|
78
77
|
width, height = 1024, 768
|
|
79
|
-
|
|
78
|
+
|
|
80
79
|
return {
|
|
81
80
|
"type": tool_version,
|
|
82
81
|
"function": {
|
|
@@ -89,32 +88,34 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
|
|
|
89
88
|
},
|
|
90
89
|
}
|
|
91
90
|
|
|
91
|
+
|
|
92
92
|
async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
|
|
93
93
|
"""Prepare tools for Anthropic API format."""
|
|
94
94
|
tool_config = _get_tool_config_for_model(model)
|
|
95
95
|
anthropic_tools = []
|
|
96
|
-
|
|
96
|
+
|
|
97
97
|
for schema in tool_schemas:
|
|
98
98
|
if schema["type"] == "computer":
|
|
99
99
|
# Map computer tool to Anthropic format
|
|
100
|
-
anthropic_tools.append(
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
100
|
+
anthropic_tools.append(
|
|
101
|
+
await _map_computer_tool_to_anthropic(
|
|
102
|
+
schema["computer"], tool_config["tool_version"]
|
|
103
|
+
)
|
|
104
|
+
)
|
|
104
105
|
elif schema["type"] == "function":
|
|
105
106
|
# Function tools - convert to Anthropic format
|
|
106
107
|
function_schema = schema["function"]
|
|
107
|
-
anthropic_tools.append(
|
|
108
|
-
|
|
109
|
-
"function": {
|
|
108
|
+
anthropic_tools.append(
|
|
109
|
+
{
|
|
110
110
|
"name": function_schema["name"],
|
|
111
111
|
"description": function_schema.get("description", ""),
|
|
112
|
-
"
|
|
112
|
+
"input_schema": function_schema.get("parameters", {}),
|
|
113
113
|
}
|
|
114
|
-
|
|
115
|
-
|
|
114
|
+
)
|
|
115
|
+
|
|
116
116
|
return anthropic_tools
|
|
117
117
|
|
|
118
|
+
|
|
118
119
|
def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
|
|
119
120
|
"""Convert responses_items message format to liteLLM completion format."""
|
|
120
121
|
completion_messages = []
|
|
@@ -123,7 +124,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
123
124
|
for message in messages:
|
|
124
125
|
msg_type = message.get("type")
|
|
125
126
|
role = message.get("role")
|
|
126
|
-
|
|
127
|
+
|
|
127
128
|
# Handle user messages (both with and without explicit type)
|
|
128
129
|
if role == "user" or msg_type == "user":
|
|
129
130
|
content = message.get("content", "")
|
|
@@ -132,55 +133,41 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
132
133
|
converted_content = []
|
|
133
134
|
for item in content:
|
|
134
135
|
if isinstance(item, dict) and item.get("type") == "input_image":
|
|
135
|
-
# Convert input_image to
|
|
136
|
+
# Convert input_image to OpenAI image format
|
|
136
137
|
image_url = item.get("image_url", "")
|
|
137
138
|
if image_url and image_url != "[omitted]":
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
"type": "image",
|
|
146
|
-
"source": {
|
|
147
|
-
"type": "base64",
|
|
148
|
-
"media_type": "image/png",
|
|
149
|
-
"data": base64_data
|
|
150
|
-
}
|
|
151
|
-
})
|
|
139
|
+
converted_content.append(
|
|
140
|
+
{"type": "image_url", "image_url": {"url": image_url}}
|
|
141
|
+
)
|
|
142
|
+
elif isinstance(item, dict) and item.get("type") == "input_text":
|
|
143
|
+
# Convert input_text to OpenAI text format
|
|
144
|
+
text = item.get("text", "")
|
|
145
|
+
converted_content.append({"type": "text", "text": text})
|
|
152
146
|
else:
|
|
153
147
|
# Keep other content types as-is
|
|
154
148
|
converted_content.append(item)
|
|
155
|
-
|
|
156
|
-
completion_messages.append(
|
|
157
|
-
"role": "user",
|
|
158
|
-
|
|
159
|
-
})
|
|
149
|
+
|
|
150
|
+
completion_messages.append(
|
|
151
|
+
{"role": "user", "content": converted_content if converted_content else content}
|
|
152
|
+
)
|
|
160
153
|
else:
|
|
161
154
|
# Text content
|
|
162
|
-
completion_messages.append({
|
|
163
|
-
|
|
164
|
-
"content": content
|
|
165
|
-
})
|
|
166
|
-
|
|
155
|
+
completion_messages.append({"role": "user", "content": content})
|
|
156
|
+
|
|
167
157
|
# Handle assistant messages
|
|
168
158
|
elif role == "assistant":
|
|
169
159
|
content = message.get("content", [])
|
|
170
160
|
if isinstance(content, str):
|
|
171
|
-
content = [{
|
|
172
|
-
|
|
161
|
+
content = [{"type": "output_text", "text": content}]
|
|
162
|
+
|
|
173
163
|
content = "\n".join(item.get("text", "") for item in content)
|
|
174
|
-
completion_messages.append({
|
|
175
|
-
|
|
176
|
-
"content": content
|
|
177
|
-
})
|
|
178
|
-
|
|
164
|
+
completion_messages.append({"role": "assistant", "content": content})
|
|
165
|
+
|
|
179
166
|
elif msg_type == "reasoning":
|
|
180
167
|
# Reasoning becomes part of assistant message
|
|
181
168
|
summary = message.get("summary", [])
|
|
182
169
|
reasoning_text = ""
|
|
183
|
-
|
|
170
|
+
|
|
184
171
|
if isinstance(summary, list) and summary:
|
|
185
172
|
# Extract text from summary items
|
|
186
173
|
for item in summary:
|
|
@@ -190,58 +177,54 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
190
177
|
else:
|
|
191
178
|
# Fallback to direct reasoning field
|
|
192
179
|
reasoning_text = message.get("reasoning", "")
|
|
193
|
-
|
|
180
|
+
|
|
194
181
|
if reasoning_text:
|
|
195
|
-
completion_messages.append({
|
|
196
|
-
|
|
197
|
-
"content": reasoning_text
|
|
198
|
-
})
|
|
199
|
-
|
|
182
|
+
completion_messages.append({"role": "assistant", "content": reasoning_text})
|
|
183
|
+
|
|
200
184
|
elif msg_type == "function_call":
|
|
201
185
|
fn_name = message.get("name")
|
|
202
186
|
fn_args = message.get("arguments", "{}")
|
|
203
187
|
call_id = message.get("call_id", "call_1")
|
|
204
188
|
call_id_to_fn_name[call_id] = fn_name
|
|
205
|
-
openai_tool_calls = [
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
"name": fn_name,
|
|
210
|
-
"arguments": fn_args
|
|
189
|
+
openai_tool_calls = [
|
|
190
|
+
{
|
|
191
|
+
"id": call_id,
|
|
192
|
+
"type": "function",
|
|
193
|
+
"function": {"name": fn_name, "arguments": fn_args},
|
|
211
194
|
}
|
|
212
|
-
|
|
195
|
+
] # If the last completion message is an assistant message, extend the tool_calls
|
|
213
196
|
if completion_messages and completion_messages[-1].get("role") == "assistant":
|
|
214
197
|
if "tool_calls" not in completion_messages[-1]:
|
|
215
198
|
completion_messages[-1]["tool_calls"] = []
|
|
216
199
|
completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
|
|
217
200
|
else:
|
|
218
201
|
# Create new assistant message with tool calls
|
|
219
|
-
completion_messages.append(
|
|
220
|
-
"role": "assistant",
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
})
|
|
224
|
-
|
|
202
|
+
completion_messages.append(
|
|
203
|
+
{"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
|
|
204
|
+
)
|
|
205
|
+
|
|
225
206
|
elif msg_type == "function_call_output":
|
|
226
207
|
call_id = message.get("call_id", "call_1")
|
|
227
208
|
fn_output = message.get("output", "")
|
|
228
209
|
fn_name = call_id_to_fn_name.get(call_id, "computer")
|
|
229
210
|
|
|
230
|
-
completion_messages.append(
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
211
|
+
completion_messages.append(
|
|
212
|
+
{
|
|
213
|
+
"role": "function",
|
|
214
|
+
"name": fn_name,
|
|
215
|
+
"tool_call_id": call_id,
|
|
216
|
+
"content": str(fn_output),
|
|
217
|
+
}
|
|
218
|
+
)
|
|
219
|
+
|
|
237
220
|
elif msg_type == "computer_call":
|
|
238
221
|
# Computer call becomes tool use in assistant message
|
|
239
222
|
action = message.get("action", {})
|
|
240
223
|
action_type = action.get("type")
|
|
241
224
|
call_id = message.get("call_id", "call_1")
|
|
242
|
-
|
|
225
|
+
|
|
243
226
|
tool_use_content = []
|
|
244
|
-
|
|
227
|
+
|
|
245
228
|
# Basic actions (all versions)
|
|
246
229
|
if action_type == "click":
|
|
247
230
|
# Input:
|
|
@@ -254,7 +237,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
254
237
|
# "y": 200
|
|
255
238
|
# }
|
|
256
239
|
# }
|
|
257
|
-
|
|
240
|
+
|
|
258
241
|
# Output:
|
|
259
242
|
# {
|
|
260
243
|
# "function": {
|
|
@@ -268,16 +251,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
268
251
|
# "type": "function"
|
|
269
252
|
# }
|
|
270
253
|
button = action.get("button", "left")
|
|
271
|
-
action_name =
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
"
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
"
|
|
254
|
+
action_name = (
|
|
255
|
+
"right_click"
|
|
256
|
+
if button == "right"
|
|
257
|
+
else "middle_click" if button == "wheel" else "left_click"
|
|
258
|
+
)
|
|
259
|
+
tool_use_content.append(
|
|
260
|
+
{
|
|
261
|
+
"type": "tool_use",
|
|
262
|
+
"id": call_id,
|
|
263
|
+
"name": "computer",
|
|
264
|
+
"input": {
|
|
265
|
+
"action": action_name,
|
|
266
|
+
"coordinate": [action.get("x", 0), action.get("y", 0)],
|
|
267
|
+
},
|
|
279
268
|
}
|
|
280
|
-
|
|
269
|
+
)
|
|
281
270
|
elif action_type == "double_click":
|
|
282
271
|
# Input:
|
|
283
272
|
# {
|
|
@@ -289,7 +278,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
289
278
|
# "y": 240
|
|
290
279
|
# }
|
|
291
280
|
# }
|
|
292
|
-
|
|
281
|
+
|
|
293
282
|
# Output:
|
|
294
283
|
# {
|
|
295
284
|
# "function": {
|
|
@@ -302,15 +291,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
302
291
|
# "id": "call_1",
|
|
303
292
|
# "type": "function"
|
|
304
293
|
# }
|
|
305
|
-
tool_use_content.append(
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
"
|
|
311
|
-
|
|
294
|
+
tool_use_content.append(
|
|
295
|
+
{
|
|
296
|
+
"type": "tool_use",
|
|
297
|
+
"id": call_id,
|
|
298
|
+
"name": "computer",
|
|
299
|
+
"input": {
|
|
300
|
+
"action": "double_click",
|
|
301
|
+
"coordinate": [action.get("x", 0), action.get("y", 0)],
|
|
302
|
+
},
|
|
312
303
|
}
|
|
313
|
-
|
|
304
|
+
)
|
|
314
305
|
elif action_type == "type":
|
|
315
306
|
# Input:
|
|
316
307
|
# {
|
|
@@ -321,7 +312,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
321
312
|
# "text": "Hello World"
|
|
322
313
|
# }
|
|
323
314
|
# }
|
|
324
|
-
|
|
315
|
+
|
|
325
316
|
# Output:
|
|
326
317
|
# {
|
|
327
318
|
# "function": {
|
|
@@ -334,15 +325,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
334
325
|
# "id": "call_1",
|
|
335
326
|
# "type": "function"
|
|
336
327
|
# }
|
|
337
|
-
tool_use_content.append(
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
"action": "type",
|
|
343
|
-
"text": action.get("text", "")
|
|
328
|
+
tool_use_content.append(
|
|
329
|
+
{
|
|
330
|
+
"type": "tool_use",
|
|
331
|
+
"id": call_id,
|
|
332
|
+
"name": "computer",
|
|
333
|
+
"input": {"action": "type", "text": action.get("text", "")},
|
|
344
334
|
}
|
|
345
|
-
|
|
335
|
+
)
|
|
346
336
|
elif action_type == "keypress":
|
|
347
337
|
# Input:
|
|
348
338
|
# {
|
|
@@ -353,7 +343,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
353
343
|
# "keys": ["ctrl", "c"]
|
|
354
344
|
# }
|
|
355
345
|
# }
|
|
356
|
-
|
|
346
|
+
|
|
357
347
|
# Output:
|
|
358
348
|
# {
|
|
359
349
|
# "function": {
|
|
@@ -366,15 +356,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
366
356
|
# "id": "call_1",
|
|
367
357
|
# "type": "function"
|
|
368
358
|
# }
|
|
369
|
-
tool_use_content.append(
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
"action": "key",
|
|
375
|
-
"text": "+".join(action.get("keys", []))
|
|
359
|
+
tool_use_content.append(
|
|
360
|
+
{
|
|
361
|
+
"type": "tool_use",
|
|
362
|
+
"id": call_id,
|
|
363
|
+
"name": "computer",
|
|
364
|
+
"input": {"action": "key", "text": "+".join(action.get("keys", []))},
|
|
376
365
|
}
|
|
377
|
-
|
|
366
|
+
)
|
|
378
367
|
elif action_type in ["mouse_move", "move"]:
|
|
379
368
|
# Input:
|
|
380
369
|
# {
|
|
@@ -386,7 +375,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
386
375
|
# "y": 250
|
|
387
376
|
# }
|
|
388
377
|
# }
|
|
389
|
-
|
|
378
|
+
|
|
390
379
|
# Output:
|
|
391
380
|
# {
|
|
392
381
|
# "function": {
|
|
@@ -399,15 +388,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
399
388
|
# "id": "call_1",
|
|
400
389
|
# "type": "function"
|
|
401
390
|
# }
|
|
402
|
-
tool_use_content.append(
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
"
|
|
408
|
-
|
|
391
|
+
tool_use_content.append(
|
|
392
|
+
{
|
|
393
|
+
"type": "tool_use",
|
|
394
|
+
"id": call_id,
|
|
395
|
+
"name": "computer",
|
|
396
|
+
"input": {
|
|
397
|
+
"action": "mouse_move",
|
|
398
|
+
"coordinate": [action.get("x", 0), action.get("y", 0)],
|
|
399
|
+
},
|
|
409
400
|
}
|
|
410
|
-
|
|
401
|
+
)
|
|
411
402
|
elif action_type == "scroll":
|
|
412
403
|
# Input:
|
|
413
404
|
# {
|
|
@@ -421,7 +412,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
421
412
|
# "scroll_y": -5
|
|
422
413
|
# }
|
|
423
414
|
# }
|
|
424
|
-
|
|
415
|
+
|
|
425
416
|
# Output:
|
|
426
417
|
# {
|
|
427
418
|
# "function": {
|
|
@@ -454,18 +445,20 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
454
445
|
else:
|
|
455
446
|
direction = "down"
|
|
456
447
|
amount = 3
|
|
457
|
-
|
|
458
|
-
tool_use_content.append(
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
"
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
448
|
+
|
|
449
|
+
tool_use_content.append(
|
|
450
|
+
{
|
|
451
|
+
"type": "tool_use",
|
|
452
|
+
"id": call_id,
|
|
453
|
+
"name": "computer",
|
|
454
|
+
"input": {
|
|
455
|
+
"action": "scroll",
|
|
456
|
+
"coordinate": [action.get("x", 0), action.get("y", 0)],
|
|
457
|
+
"scroll_direction": direction,
|
|
458
|
+
"scroll_amount": amount,
|
|
459
|
+
},
|
|
467
460
|
}
|
|
468
|
-
|
|
461
|
+
)
|
|
469
462
|
elif action_type == "drag":
|
|
470
463
|
# Input:
|
|
471
464
|
# {
|
|
@@ -479,7 +472,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
479
472
|
# ]
|
|
480
473
|
# }
|
|
481
474
|
# }
|
|
482
|
-
|
|
475
|
+
|
|
483
476
|
# Output:
|
|
484
477
|
# {
|
|
485
478
|
# "function": {
|
|
@@ -499,17 +492,19 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
499
492
|
if isinstance(path, list) and len(path) >= 2:
|
|
500
493
|
start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
|
|
501
494
|
end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
|
|
502
|
-
|
|
503
|
-
tool_use_content.append(
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
"
|
|
509
|
-
|
|
510
|
-
|
|
495
|
+
|
|
496
|
+
tool_use_content.append(
|
|
497
|
+
{
|
|
498
|
+
"type": "tool_use",
|
|
499
|
+
"id": call_id,
|
|
500
|
+
"name": "computer",
|
|
501
|
+
"input": {
|
|
502
|
+
"action": "left_click_drag",
|
|
503
|
+
"start_coordinate": start_coord,
|
|
504
|
+
"end_coordinate": end_coord,
|
|
505
|
+
},
|
|
511
506
|
}
|
|
512
|
-
|
|
507
|
+
)
|
|
513
508
|
elif action_type == "wait":
|
|
514
509
|
# Input:
|
|
515
510
|
# {
|
|
@@ -519,7 +514,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
519
514
|
# "type": "wait"
|
|
520
515
|
# }
|
|
521
516
|
# }
|
|
522
|
-
|
|
517
|
+
|
|
523
518
|
# Output:
|
|
524
519
|
# {
|
|
525
520
|
# "function": {
|
|
@@ -531,14 +526,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
531
526
|
# "id": "call_1",
|
|
532
527
|
# "type": "function"
|
|
533
528
|
# }
|
|
534
|
-
tool_use_content.append(
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
"action": "wait"
|
|
529
|
+
tool_use_content.append(
|
|
530
|
+
{
|
|
531
|
+
"type": "tool_use",
|
|
532
|
+
"id": call_id,
|
|
533
|
+
"name": "computer",
|
|
534
|
+
"input": {"action": "wait"},
|
|
540
535
|
}
|
|
541
|
-
|
|
536
|
+
)
|
|
542
537
|
elif action_type == "screenshot":
|
|
543
538
|
# Input:
|
|
544
539
|
# {
|
|
@@ -548,7 +543,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
548
543
|
# "type": "screenshot"
|
|
549
544
|
# }
|
|
550
545
|
# }
|
|
551
|
-
|
|
546
|
+
|
|
552
547
|
# Output:
|
|
553
548
|
# {
|
|
554
549
|
# "function": {
|
|
@@ -560,47 +555,53 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
560
555
|
# "id": "call_1",
|
|
561
556
|
# "type": "function"
|
|
562
557
|
# }
|
|
563
|
-
tool_use_content.append(
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
"action": "screenshot"
|
|
558
|
+
tool_use_content.append(
|
|
559
|
+
{
|
|
560
|
+
"type": "tool_use",
|
|
561
|
+
"id": call_id,
|
|
562
|
+
"name": "computer",
|
|
563
|
+
"input": {"action": "screenshot"},
|
|
569
564
|
}
|
|
570
|
-
|
|
565
|
+
)
|
|
571
566
|
elif action_type == "left_mouse_down":
|
|
572
|
-
tool_use_content.append(
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
"
|
|
578
|
-
|
|
567
|
+
tool_use_content.append(
|
|
568
|
+
{
|
|
569
|
+
"type": "tool_use",
|
|
570
|
+
"id": call_id,
|
|
571
|
+
"name": "computer",
|
|
572
|
+
"input": {
|
|
573
|
+
"action": "left_mouse_down",
|
|
574
|
+
"coordinate": [action.get("x", None), action.get("y", None)],
|
|
575
|
+
},
|
|
579
576
|
}
|
|
580
|
-
|
|
577
|
+
)
|
|
581
578
|
elif action_type == "left_mouse_up":
|
|
582
|
-
tool_use_content.append(
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
"
|
|
588
|
-
|
|
579
|
+
tool_use_content.append(
|
|
580
|
+
{
|
|
581
|
+
"type": "tool_use",
|
|
582
|
+
"id": call_id,
|
|
583
|
+
"name": "computer",
|
|
584
|
+
"input": {
|
|
585
|
+
"action": "left_mouse_up",
|
|
586
|
+
"coordinate": [action.get("x", None), action.get("y", None)],
|
|
587
|
+
},
|
|
589
588
|
}
|
|
590
|
-
|
|
591
|
-
|
|
589
|
+
)
|
|
590
|
+
|
|
592
591
|
# Convert tool_use_content to OpenAI tool_calls format
|
|
593
592
|
openai_tool_calls = []
|
|
594
593
|
for tool_use in tool_use_content:
|
|
595
|
-
openai_tool_calls.append(
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
"
|
|
600
|
-
|
|
594
|
+
openai_tool_calls.append(
|
|
595
|
+
{
|
|
596
|
+
"id": tool_use["id"],
|
|
597
|
+
"type": "function",
|
|
598
|
+
"function": {
|
|
599
|
+
"name": tool_use["name"],
|
|
600
|
+
"arguments": json.dumps(tool_use["input"]),
|
|
601
|
+
},
|
|
601
602
|
}
|
|
602
|
-
|
|
603
|
-
|
|
603
|
+
)
|
|
604
|
+
|
|
604
605
|
# If the last completion message is an assistant message, extend the tool_calls
|
|
605
606
|
if completion_messages and completion_messages[-1].get("role") == "assistant":
|
|
606
607
|
if "tool_calls" not in completion_messages[-1]:
|
|
@@ -608,54 +609,52 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
608
609
|
completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
|
|
609
610
|
else:
|
|
610
611
|
# Create new assistant message with tool calls
|
|
611
|
-
completion_messages.append(
|
|
612
|
-
"role": "assistant",
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
})
|
|
616
|
-
|
|
612
|
+
completion_messages.append(
|
|
613
|
+
{"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
|
|
614
|
+
)
|
|
615
|
+
|
|
617
616
|
elif msg_type == "computer_call_output":
|
|
618
617
|
# Computer call output becomes OpenAI function result
|
|
619
618
|
output = message.get("output", {})
|
|
620
619
|
call_id = message.get("call_id", "call_1")
|
|
621
|
-
|
|
620
|
+
|
|
622
621
|
if output.get("type") == "input_image":
|
|
623
622
|
# Screenshot result - convert to OpenAI format with image_url content
|
|
624
623
|
image_url = output.get("image_url", "")
|
|
625
|
-
completion_messages.append(
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
"type": "image_url",
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
}
|
|
634
|
-
}]
|
|
635
|
-
})
|
|
624
|
+
completion_messages.append(
|
|
625
|
+
{
|
|
626
|
+
"role": "function",
|
|
627
|
+
"name": "computer",
|
|
628
|
+
"tool_call_id": call_id,
|
|
629
|
+
"content": [{"type": "image_url", "image_url": {"url": image_url}}],
|
|
630
|
+
}
|
|
631
|
+
)
|
|
636
632
|
else:
|
|
637
633
|
# Text result - convert to OpenAI format
|
|
638
|
-
completion_messages.append(
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
634
|
+
completion_messages.append(
|
|
635
|
+
{
|
|
636
|
+
"role": "function",
|
|
637
|
+
"name": "computer",
|
|
638
|
+
"tool_call_id": call_id,
|
|
639
|
+
"content": str(output),
|
|
640
|
+
}
|
|
641
|
+
)
|
|
642
|
+
|
|
645
643
|
return completion_messages
|
|
646
644
|
|
|
645
|
+
|
|
647
646
|
def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
|
|
648
647
|
"""Convert liteLLM completion response to responses_items message format."""
|
|
649
648
|
responses_items = []
|
|
650
|
-
|
|
651
|
-
if not response or not hasattr(response,
|
|
649
|
+
|
|
650
|
+
if not response or not hasattr(response, "choices") or not response.choices:
|
|
652
651
|
return responses_items
|
|
653
|
-
|
|
652
|
+
|
|
654
653
|
choice = response.choices[0]
|
|
655
654
|
message = choice.message
|
|
656
|
-
|
|
655
|
+
|
|
657
656
|
# Handle text content
|
|
658
|
-
if hasattr(message,
|
|
657
|
+
if hasattr(message, "content") and message.content:
|
|
659
658
|
if isinstance(message.content, str):
|
|
660
659
|
responses_items.append(make_output_text_item(message.content))
|
|
661
660
|
elif isinstance(message.content, list):
|
|
@@ -664,35 +663,54 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
664
663
|
if content_item.get("type") == "text":
|
|
665
664
|
responses_items.append(make_output_text_item(content_item.get("text", "")))
|
|
666
665
|
elif content_item.get("type") == "tool_use":
|
|
667
|
-
#
|
|
666
|
+
# Check if this is a custom function tool or computer tool
|
|
667
|
+
tool_name = content_item.get("name", "computer")
|
|
668
668
|
tool_input = content_item.get("input", {})
|
|
669
|
-
action_type = tool_input.get("action")
|
|
670
669
|
call_id = content_item.get("id")
|
|
671
|
-
|
|
670
|
+
|
|
671
|
+
# Handle custom function tools (not computer tools)
|
|
672
|
+
if tool_name != "computer":
|
|
673
|
+
from ..responses import make_function_call_item
|
|
674
|
+
|
|
675
|
+
responses_items.append(
|
|
676
|
+
make_function_call_item(
|
|
677
|
+
function_name=tool_name, arguments=tool_input, call_id=call_id
|
|
678
|
+
)
|
|
679
|
+
)
|
|
680
|
+
continue
|
|
681
|
+
|
|
682
|
+
# Computer tool - process actions
|
|
683
|
+
action_type = tool_input.get("action")
|
|
684
|
+
|
|
672
685
|
# Action reference:
|
|
673
686
|
# https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions
|
|
674
|
-
|
|
687
|
+
|
|
675
688
|
try:
|
|
676
689
|
# Basic actions (all versions)
|
|
677
690
|
if action_type == "screenshot":
|
|
678
691
|
responses_items.append(make_screenshot_item(call_id=call_id))
|
|
679
692
|
elif action_type in ["click", "left_click"]:
|
|
680
693
|
coordinate = tool_input.get("coordinate", [0, 0])
|
|
681
|
-
responses_items.append(
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
694
|
+
responses_items.append(
|
|
695
|
+
make_click_item(
|
|
696
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
697
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
698
|
+
call_id=call_id,
|
|
699
|
+
)
|
|
700
|
+
)
|
|
686
701
|
elif action_type in ["type", "type_text"]:
|
|
687
|
-
responses_items.append(
|
|
688
|
-
text=tool_input.get("text", ""),
|
|
689
|
-
|
|
690
|
-
))
|
|
702
|
+
responses_items.append(
|
|
703
|
+
make_type_item(text=tool_input.get("text", ""), call_id=call_id)
|
|
704
|
+
)
|
|
691
705
|
elif action_type in ["key", "keypress", "hotkey"]:
|
|
692
|
-
responses_items.append(
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
706
|
+
responses_items.append(
|
|
707
|
+
make_keypress_item(
|
|
708
|
+
keys=tool_input.get("text", "")
|
|
709
|
+
.replace("+", "-")
|
|
710
|
+
.split("-"),
|
|
711
|
+
call_id=call_id,
|
|
712
|
+
)
|
|
713
|
+
)
|
|
696
714
|
elif action_type in ["mouse_move", "move_cursor", "move"]:
|
|
697
715
|
# Mouse move - create a custom action item
|
|
698
716
|
coordinate = tool_input.get("coordinate", [0, 0])
|
|
@@ -700,64 +718,88 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
700
718
|
make_move_item(
|
|
701
719
|
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
702
720
|
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
703
|
-
call_id=call_id
|
|
721
|
+
call_id=call_id,
|
|
704
722
|
)
|
|
705
723
|
)
|
|
706
|
-
|
|
724
|
+
|
|
707
725
|
# Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
|
|
708
726
|
elif action_type == "scroll":
|
|
709
727
|
coordinate = tool_input.get("coordinate", [0, 0])
|
|
710
728
|
scroll_amount = tool_input.get("scroll_amount", 3)
|
|
711
|
-
scroll_x =
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
729
|
+
scroll_x = (
|
|
730
|
+
scroll_amount
|
|
731
|
+
if tool_input.get("scroll_direction", "down") == "right"
|
|
732
|
+
else (
|
|
733
|
+
-scroll_amount
|
|
734
|
+
if tool_input.get("scroll_direction", "down") == "left"
|
|
735
|
+
else 0
|
|
736
|
+
)
|
|
737
|
+
)
|
|
738
|
+
scroll_y = (
|
|
739
|
+
scroll_amount
|
|
740
|
+
if tool_input.get("scroll_direction", "down") == "down"
|
|
741
|
+
else (
|
|
742
|
+
-scroll_amount
|
|
743
|
+
if tool_input.get("scroll_direction", "down") == "up"
|
|
744
|
+
else 0
|
|
745
|
+
)
|
|
746
|
+
)
|
|
747
|
+
responses_items.append(
|
|
748
|
+
make_scroll_item(
|
|
749
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
750
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
751
|
+
scroll_x=scroll_x,
|
|
752
|
+
scroll_y=scroll_y,
|
|
753
|
+
call_id=call_id,
|
|
754
|
+
)
|
|
755
|
+
)
|
|
722
756
|
elif action_type in ["left_click_drag", "drag"]:
|
|
723
757
|
start_coord = tool_input.get("start_coordinate", [0, 0])
|
|
724
758
|
end_coord = tool_input.get("end_coordinate", [0, 0])
|
|
725
|
-
responses_items.append(
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
759
|
+
responses_items.append(
|
|
760
|
+
make_drag_item(
|
|
761
|
+
path=[
|
|
762
|
+
{
|
|
763
|
+
"x": start_coord[0] if len(start_coord) > 0 else 0,
|
|
764
|
+
"y": start_coord[1] if len(start_coord) > 1 else 0,
|
|
765
|
+
},
|
|
766
|
+
{
|
|
767
|
+
"x": end_coord[0] if len(end_coord) > 0 else 0,
|
|
768
|
+
"y": end_coord[1] if len(end_coord) > 1 else 0,
|
|
769
|
+
},
|
|
770
|
+
],
|
|
771
|
+
call_id=call_id,
|
|
772
|
+
)
|
|
773
|
+
)
|
|
738
774
|
elif action_type == "right_click":
|
|
739
775
|
coordinate = tool_input.get("coordinate", [0, 0])
|
|
740
|
-
responses_items.append(
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
776
|
+
responses_items.append(
|
|
777
|
+
make_click_item(
|
|
778
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
779
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
780
|
+
button="right",
|
|
781
|
+
call_id=call_id,
|
|
782
|
+
)
|
|
783
|
+
)
|
|
746
784
|
elif action_type == "middle_click":
|
|
747
785
|
coordinate = tool_input.get("coordinate", [0, 0])
|
|
748
|
-
responses_items.append(
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
786
|
+
responses_items.append(
|
|
787
|
+
make_click_item(
|
|
788
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
789
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
790
|
+
button="wheel",
|
|
791
|
+
call_id=call_id,
|
|
792
|
+
)
|
|
793
|
+
)
|
|
754
794
|
elif action_type == "double_click":
|
|
755
795
|
coordinate = tool_input.get("coordinate", [0, 0])
|
|
756
|
-
responses_items.append(
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
796
|
+
responses_items.append(
|
|
797
|
+
make_double_click_item(
|
|
798
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
799
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
800
|
+
call_id=call_id,
|
|
801
|
+
)
|
|
802
|
+
)
|
|
761
803
|
elif action_type == "triple_click":
|
|
762
804
|
# coordinate = tool_input.get("coordinate", [0, 0])
|
|
763
805
|
# responses_items.append({
|
|
@@ -783,11 +825,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
783
825
|
# }
|
|
784
826
|
# })
|
|
785
827
|
coordinate = tool_input.get("coordinate", [None, None])
|
|
786
|
-
responses_items.append(
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
828
|
+
responses_items.append(
|
|
829
|
+
make_left_mouse_down_item(
|
|
830
|
+
x=coordinate[0] if len(coordinate) > 0 else None,
|
|
831
|
+
y=coordinate[1] if len(coordinate) > 1 else None,
|
|
832
|
+
call_id=call_id,
|
|
833
|
+
)
|
|
834
|
+
)
|
|
791
835
|
elif action_type == "left_mouse_up":
|
|
792
836
|
# coordinate = tool_input.get("coordinate", [0, 0])
|
|
793
837
|
# responses_items.append({
|
|
@@ -801,11 +845,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
801
845
|
# }
|
|
802
846
|
# })
|
|
803
847
|
coordinate = tool_input.get("coordinate", [None, None])
|
|
804
|
-
responses_items.append(
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
848
|
+
responses_items.append(
|
|
849
|
+
make_left_mouse_up_item(
|
|
850
|
+
x=coordinate[0] if len(coordinate) > 0 else None,
|
|
851
|
+
y=coordinate[1] if len(coordinate) > 1 else None,
|
|
852
|
+
call_id=call_id,
|
|
853
|
+
)
|
|
854
|
+
)
|
|
809
855
|
elif action_type == "hold_key":
|
|
810
856
|
# responses_items.append({
|
|
811
857
|
# "type": "computer_call",
|
|
@@ -817,22 +863,41 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
817
863
|
# })
|
|
818
864
|
raise NotImplementedError("hold_key")
|
|
819
865
|
elif action_type == "wait":
|
|
820
|
-
responses_items.append(make_wait_item(
|
|
821
|
-
call_id=call_id
|
|
822
|
-
))
|
|
866
|
+
responses_items.append(make_wait_item(call_id=call_id))
|
|
823
867
|
else:
|
|
824
868
|
raise ValueError(f"Unknown action type: {action_type}")
|
|
825
869
|
except Exception as e:
|
|
826
|
-
responses_items.extend(
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
870
|
+
responses_items.extend(
|
|
871
|
+
make_failed_tool_call_items(
|
|
872
|
+
tool_name="computer",
|
|
873
|
+
tool_kwargs=tool_input,
|
|
874
|
+
error_message=repr(e),
|
|
875
|
+
call_id=call_id,
|
|
876
|
+
)
|
|
877
|
+
)
|
|
878
|
+
|
|
833
879
|
# Handle tool calls (alternative format)
|
|
834
|
-
if hasattr(message,
|
|
880
|
+
if hasattr(message, "tool_calls") and message.tool_calls:
|
|
835
881
|
for tool_call in message.tool_calls:
|
|
882
|
+
tool_name = tool_call.function.name
|
|
883
|
+
|
|
884
|
+
# Handle custom function tools
|
|
885
|
+
if tool_name != "computer":
|
|
886
|
+
from ..responses import make_function_call_item
|
|
887
|
+
|
|
888
|
+
# tool_call.function.arguments is a JSON string, need to parse it
|
|
889
|
+
try:
|
|
890
|
+
args_dict = json.loads(tool_call.function.arguments)
|
|
891
|
+
except json.JSONDecodeError:
|
|
892
|
+
args_dict = {}
|
|
893
|
+
responses_items.append(
|
|
894
|
+
make_function_call_item(
|
|
895
|
+
function_name=tool_name, arguments=args_dict, call_id=tool_call.id
|
|
896
|
+
)
|
|
897
|
+
)
|
|
898
|
+
continue
|
|
899
|
+
|
|
900
|
+
# Handle computer tool
|
|
836
901
|
if tool_call.function.name == "computer":
|
|
837
902
|
try:
|
|
838
903
|
try:
|
|
@@ -853,7 +918,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
853
918
|
# "id": "call_1",
|
|
854
919
|
# "type": "function"
|
|
855
920
|
# }
|
|
856
|
-
|
|
921
|
+
|
|
857
922
|
# Output:
|
|
858
923
|
# {
|
|
859
924
|
# "type": "computer_call",
|
|
@@ -862,9 +927,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
862
927
|
# "type": "screenshot"
|
|
863
928
|
# }
|
|
864
929
|
# }
|
|
865
|
-
responses_items.append(make_screenshot_item(
|
|
866
|
-
call_id=call_id
|
|
867
|
-
))
|
|
930
|
+
responses_items.append(make_screenshot_item(call_id=call_id))
|
|
868
931
|
elif action_type in ["click", "left_click"]:
|
|
869
932
|
# Input:
|
|
870
933
|
# {
|
|
@@ -878,7 +941,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
878
941
|
# "id": "call_1",
|
|
879
942
|
# "type": "function"
|
|
880
943
|
# }
|
|
881
|
-
|
|
944
|
+
|
|
882
945
|
# Output:
|
|
883
946
|
# {
|
|
884
947
|
# "type": "computer_call",
|
|
@@ -890,11 +953,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
890
953
|
# }
|
|
891
954
|
# }
|
|
892
955
|
coordinate = args.get("coordinate", [0, 0])
|
|
893
|
-
responses_items.append(
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
956
|
+
responses_items.append(
|
|
957
|
+
make_click_item(
|
|
958
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
959
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
960
|
+
call_id=call_id,
|
|
961
|
+
)
|
|
962
|
+
)
|
|
898
963
|
elif action_type in ["type", "type_text"]:
|
|
899
964
|
# Input:
|
|
900
965
|
# {
|
|
@@ -908,7 +973,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
908
973
|
# "id": "call_1",
|
|
909
974
|
# "type": "function"
|
|
910
975
|
# }
|
|
911
|
-
|
|
976
|
+
|
|
912
977
|
# Output:
|
|
913
978
|
# {
|
|
914
979
|
# "type": "computer_call",
|
|
@@ -918,10 +983,9 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
918
983
|
# "text": "Hello World"
|
|
919
984
|
# }
|
|
920
985
|
# }
|
|
921
|
-
responses_items.append(
|
|
922
|
-
text=args.get("text", ""),
|
|
923
|
-
|
|
924
|
-
))
|
|
986
|
+
responses_items.append(
|
|
987
|
+
make_type_item(text=args.get("text", ""), call_id=call_id)
|
|
988
|
+
)
|
|
925
989
|
elif action_type in ["key", "keypress", "hotkey"]:
|
|
926
990
|
# Input:
|
|
927
991
|
# {
|
|
@@ -935,7 +999,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
935
999
|
# "id": "call_1",
|
|
936
1000
|
# "type": "function"
|
|
937
1001
|
# }
|
|
938
|
-
|
|
1002
|
+
|
|
939
1003
|
# Output:
|
|
940
1004
|
# {
|
|
941
1005
|
# "type": "computer_call",
|
|
@@ -945,10 +1009,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
945
1009
|
# "keys": ["ctrl", "c"]
|
|
946
1010
|
# }
|
|
947
1011
|
# }
|
|
948
|
-
responses_items.append(
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
1012
|
+
responses_items.append(
|
|
1013
|
+
make_keypress_item(
|
|
1014
|
+
keys=args.get("text", "").replace("+", "-").split("-"),
|
|
1015
|
+
call_id=call_id,
|
|
1016
|
+
)
|
|
1017
|
+
)
|
|
952
1018
|
elif action_type in ["mouse_move", "move_cursor", "move"]:
|
|
953
1019
|
# Input:
|
|
954
1020
|
# {
|
|
@@ -962,7 +1028,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
962
1028
|
# "id": "call_1",
|
|
963
1029
|
# "type": "function"
|
|
964
1030
|
# }
|
|
965
|
-
|
|
1031
|
+
|
|
966
1032
|
# Output:
|
|
967
1033
|
# {
|
|
968
1034
|
# "type": "computer_call",
|
|
@@ -974,12 +1040,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
974
1040
|
# }
|
|
975
1041
|
# }
|
|
976
1042
|
coordinate = args.get("coordinate", [0, 0])
|
|
977
|
-
responses_items.append(
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
1043
|
+
responses_items.append(
|
|
1044
|
+
make_move_item(
|
|
1045
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
1046
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
1047
|
+
call_id=call_id,
|
|
1048
|
+
)
|
|
1049
|
+
)
|
|
1050
|
+
|
|
983
1051
|
# Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
|
|
984
1052
|
elif action_type == "scroll":
|
|
985
1053
|
# Input:
|
|
@@ -996,7 +1064,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
996
1064
|
# "id": "call_1",
|
|
997
1065
|
# "type": "function"
|
|
998
1066
|
# }
|
|
999
|
-
|
|
1067
|
+
|
|
1000
1068
|
# Output:
|
|
1001
1069
|
# {
|
|
1002
1070
|
# "type": "computer_call",
|
|
@@ -1012,17 +1080,25 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1012
1080
|
coordinate = args.get("coordinate", [0, 0])
|
|
1013
1081
|
direction = args.get("scroll_direction", "down")
|
|
1014
1082
|
amount = args.get("scroll_amount", 3)
|
|
1015
|
-
scroll_x =
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1083
|
+
scroll_x = (
|
|
1084
|
+
amount
|
|
1085
|
+
if direction == "left"
|
|
1086
|
+
else -amount if direction == "right" else 0
|
|
1087
|
+
)
|
|
1088
|
+
scroll_y = (
|
|
1089
|
+
amount
|
|
1090
|
+
if direction == "up"
|
|
1091
|
+
else -amount if direction == "down" else 0
|
|
1092
|
+
)
|
|
1093
|
+
responses_items.append(
|
|
1094
|
+
make_scroll_item(
|
|
1095
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
1096
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
1097
|
+
scroll_x=scroll_x,
|
|
1098
|
+
scroll_y=scroll_y,
|
|
1099
|
+
call_id=call_id,
|
|
1100
|
+
)
|
|
1101
|
+
)
|
|
1026
1102
|
elif action_type in ["left_click_drag", "drag"]:
|
|
1027
1103
|
# Input:
|
|
1028
1104
|
# {
|
|
@@ -1037,7 +1113,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1037
1113
|
# "id": "call_1",
|
|
1038
1114
|
# "type": "function"
|
|
1039
1115
|
# }
|
|
1040
|
-
|
|
1116
|
+
|
|
1041
1117
|
# Output:
|
|
1042
1118
|
# {
|
|
1043
1119
|
# "type": "computer_call",
|
|
@@ -1052,19 +1128,21 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1052
1128
|
# }
|
|
1053
1129
|
start_coord = args.get("start_coordinate", [0, 0])
|
|
1054
1130
|
end_coord = args.get("end_coordinate", [0, 0])
|
|
1055
|
-
responses_items.append(
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1131
|
+
responses_items.append(
|
|
1132
|
+
make_drag_item(
|
|
1133
|
+
path=[
|
|
1134
|
+
{
|
|
1135
|
+
"x": start_coord[0] if len(start_coord) > 0 else 0,
|
|
1136
|
+
"y": start_coord[1] if len(start_coord) > 1 else 0,
|
|
1137
|
+
},
|
|
1138
|
+
{
|
|
1139
|
+
"x": end_coord[0] if len(end_coord) > 0 else 0,
|
|
1140
|
+
"y": end_coord[1] if len(end_coord) > 1 else 0,
|
|
1141
|
+
},
|
|
1142
|
+
],
|
|
1143
|
+
call_id=call_id,
|
|
1144
|
+
)
|
|
1145
|
+
)
|
|
1068
1146
|
elif action_type == "right_click":
|
|
1069
1147
|
# Input:
|
|
1070
1148
|
# {
|
|
@@ -1078,7 +1156,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1078
1156
|
# "id": "call_1",
|
|
1079
1157
|
# "type": "function"
|
|
1080
1158
|
# }
|
|
1081
|
-
|
|
1159
|
+
|
|
1082
1160
|
# Output:
|
|
1083
1161
|
# {
|
|
1084
1162
|
# "type": "computer_call",
|
|
@@ -1091,12 +1169,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1091
1169
|
# }
|
|
1092
1170
|
# }
|
|
1093
1171
|
coordinate = args.get("coordinate", [0, 0])
|
|
1094
|
-
responses_items.append(
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1172
|
+
responses_items.append(
|
|
1173
|
+
make_click_item(
|
|
1174
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
1175
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
1176
|
+
button="right",
|
|
1177
|
+
call_id=call_id,
|
|
1178
|
+
)
|
|
1179
|
+
)
|
|
1100
1180
|
elif action_type == "middle_click":
|
|
1101
1181
|
# Input:
|
|
1102
1182
|
# {
|
|
@@ -1110,7 +1190,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1110
1190
|
# "id": "call_1",
|
|
1111
1191
|
# "type": "function"
|
|
1112
1192
|
# }
|
|
1113
|
-
|
|
1193
|
+
|
|
1114
1194
|
# Output:
|
|
1115
1195
|
# {
|
|
1116
1196
|
# "type": "computer_call",
|
|
@@ -1123,12 +1203,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1123
1203
|
# }
|
|
1124
1204
|
# }
|
|
1125
1205
|
coordinate = args.get("coordinate", [0, 0])
|
|
1126
|
-
responses_items.append(
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1206
|
+
responses_items.append(
|
|
1207
|
+
make_click_item(
|
|
1208
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
1209
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
1210
|
+
button="wheel",
|
|
1211
|
+
call_id=call_id,
|
|
1212
|
+
)
|
|
1213
|
+
)
|
|
1132
1214
|
elif action_type == "double_click":
|
|
1133
1215
|
# Input:
|
|
1134
1216
|
# {
|
|
@@ -1142,7 +1224,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1142
1224
|
# "id": "call_1",
|
|
1143
1225
|
# "type": "function"
|
|
1144
1226
|
# }
|
|
1145
|
-
|
|
1227
|
+
|
|
1146
1228
|
# Output:
|
|
1147
1229
|
# {
|
|
1148
1230
|
# "type": "computer_call",
|
|
@@ -1154,11 +1236,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1154
1236
|
# }
|
|
1155
1237
|
# }
|
|
1156
1238
|
coordinate = args.get("coordinate", [0, 0])
|
|
1157
|
-
responses_items.append(
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1239
|
+
responses_items.append(
|
|
1240
|
+
make_double_click_item(
|
|
1241
|
+
x=coordinate[0] if len(coordinate) > 0 else 0,
|
|
1242
|
+
y=coordinate[1] if len(coordinate) > 1 else 0,
|
|
1243
|
+
call_id=call_id,
|
|
1244
|
+
)
|
|
1245
|
+
)
|
|
1162
1246
|
elif action_type == "triple_click":
|
|
1163
1247
|
# Input:
|
|
1164
1248
|
# {
|
|
@@ -1172,7 +1256,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1172
1256
|
# "id": "call_1",
|
|
1173
1257
|
# "type": "function"
|
|
1174
1258
|
# }
|
|
1175
|
-
|
|
1259
|
+
|
|
1176
1260
|
# Output:
|
|
1177
1261
|
# {
|
|
1178
1262
|
# "type": "computer_call",
|
|
@@ -1197,7 +1281,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1197
1281
|
# "id": "call_1",
|
|
1198
1282
|
# "type": "function"
|
|
1199
1283
|
# }
|
|
1200
|
-
|
|
1284
|
+
|
|
1201
1285
|
# Output:
|
|
1202
1286
|
# {
|
|
1203
1287
|
# "type": "computer_call",
|
|
@@ -1210,11 +1294,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1210
1294
|
# }
|
|
1211
1295
|
# }
|
|
1212
1296
|
coordinate = args.get("coordinate", [None, None])
|
|
1213
|
-
responses_items.append(
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1297
|
+
responses_items.append(
|
|
1298
|
+
make_left_mouse_down_item(
|
|
1299
|
+
x=coordinate[0] if len(coordinate) > 0 else None,
|
|
1300
|
+
y=coordinate[1] if len(coordinate) > 1 else None,
|
|
1301
|
+
call_id=call_id,
|
|
1302
|
+
)
|
|
1303
|
+
)
|
|
1218
1304
|
elif action_type == "left_mouse_up":
|
|
1219
1305
|
# Input:
|
|
1220
1306
|
# {
|
|
@@ -1228,7 +1314,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1228
1314
|
# "id": "call_1",
|
|
1229
1315
|
# "type": "function"
|
|
1230
1316
|
# }
|
|
1231
|
-
|
|
1317
|
+
|
|
1232
1318
|
# Output:
|
|
1233
1319
|
# {
|
|
1234
1320
|
# "type": "computer_call",
|
|
@@ -1241,11 +1327,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1241
1327
|
# }
|
|
1242
1328
|
# }
|
|
1243
1329
|
coordinate = args.get("coordinate", [None, None])
|
|
1244
|
-
responses_items.append(
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1330
|
+
responses_items.append(
|
|
1331
|
+
make_left_mouse_up_item(
|
|
1332
|
+
x=coordinate[0] if len(coordinate) > 0 else None,
|
|
1333
|
+
y=coordinate[1] if len(coordinate) > 1 else None,
|
|
1334
|
+
call_id=call_id,
|
|
1335
|
+
)
|
|
1336
|
+
)
|
|
1249
1337
|
elif action_type == "hold_key":
|
|
1250
1338
|
# Input:
|
|
1251
1339
|
# {
|
|
@@ -1259,7 +1347,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1259
1347
|
# "id": "call_1",
|
|
1260
1348
|
# "type": "function"
|
|
1261
1349
|
# }
|
|
1262
|
-
|
|
1350
|
+
|
|
1263
1351
|
# Output:
|
|
1264
1352
|
# {
|
|
1265
1353
|
# "type": "computer_call",
|
|
@@ -1282,7 +1370,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1282
1370
|
# "id": "call_1",
|
|
1283
1371
|
# "type": "function"
|
|
1284
1372
|
# }
|
|
1285
|
-
|
|
1373
|
+
|
|
1286
1374
|
# Output:
|
|
1287
1375
|
# {
|
|
1288
1376
|
# "type": "computer_call",
|
|
@@ -1291,74 +1379,77 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
|
|
|
1291
1379
|
# "type": "wait"
|
|
1292
1380
|
# }
|
|
1293
1381
|
# }
|
|
1294
|
-
responses_items.append(make_wait_item(
|
|
1295
|
-
call_id=call_id
|
|
1296
|
-
))
|
|
1382
|
+
responses_items.append(make_wait_item(call_id=call_id))
|
|
1297
1383
|
except Exception as e:
|
|
1298
|
-
responses_items.extend(
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1384
|
+
responses_items.extend(
|
|
1385
|
+
make_failed_tool_call_items(
|
|
1386
|
+
tool_name="computer",
|
|
1387
|
+
tool_kwargs=args,
|
|
1388
|
+
error_message=repr(e),
|
|
1389
|
+
call_id=call_id,
|
|
1390
|
+
)
|
|
1391
|
+
)
|
|
1304
1392
|
except json.JSONDecodeError:
|
|
1305
1393
|
print("Failed to decode tool call arguments")
|
|
1306
1394
|
# Skip malformed tool calls
|
|
1307
1395
|
continue
|
|
1308
|
-
|
|
1396
|
+
|
|
1309
1397
|
return responses_items
|
|
1310
1398
|
|
|
1399
|
+
|
|
1311
1400
|
def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
1312
1401
|
"""Add cache control to completion messages"""
|
|
1313
1402
|
num_writes = 0
|
|
1314
1403
|
for message in completion_messages:
|
|
1315
|
-
message["cache_control"] = {
|
|
1404
|
+
message["cache_control"] = {"type": "ephemeral"}
|
|
1316
1405
|
num_writes += 1
|
|
1317
1406
|
# Cache control has a maximum of 4 blocks
|
|
1318
1407
|
if num_writes >= 4:
|
|
1319
1408
|
break
|
|
1320
|
-
|
|
1409
|
+
|
|
1321
1410
|
return completion_messages
|
|
1322
1411
|
|
|
1412
|
+
|
|
1323
1413
|
def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
1324
1414
|
"""Combine completion messages with the same role"""
|
|
1325
1415
|
if not completion_messages:
|
|
1326
1416
|
return completion_messages
|
|
1327
|
-
|
|
1417
|
+
|
|
1328
1418
|
combined_messages = []
|
|
1329
|
-
|
|
1419
|
+
|
|
1330
1420
|
for message in completion_messages:
|
|
1331
1421
|
# If this is the first message or role is different from last, add as new message
|
|
1332
1422
|
if not combined_messages or combined_messages[-1]["role"] != message["role"]:
|
|
1333
1423
|
# Ensure content is a list format and normalize text content
|
|
1334
1424
|
new_message = message.copy()
|
|
1335
1425
|
new_message["content"] = _normalize_content(message.get("content", ""))
|
|
1336
|
-
|
|
1426
|
+
|
|
1337
1427
|
# Copy tool_calls if present
|
|
1338
1428
|
if "tool_calls" in message:
|
|
1339
1429
|
new_message["tool_calls"] = message["tool_calls"].copy()
|
|
1340
|
-
|
|
1430
|
+
|
|
1341
1431
|
combined_messages.append(new_message)
|
|
1342
1432
|
else:
|
|
1343
1433
|
# Same role as previous message, combine them
|
|
1344
1434
|
last_message = combined_messages[-1]
|
|
1345
|
-
|
|
1435
|
+
|
|
1346
1436
|
# Combine content
|
|
1347
1437
|
current_content = _normalize_content(message.get("content", ""))
|
|
1348
1438
|
last_message["content"].extend(current_content)
|
|
1349
|
-
|
|
1439
|
+
|
|
1350
1440
|
# Combine tool_calls if present
|
|
1351
1441
|
if "tool_calls" in message:
|
|
1352
1442
|
if "tool_calls" not in last_message:
|
|
1353
1443
|
last_message["tool_calls"] = []
|
|
1354
1444
|
last_message["tool_calls"].extend(message["tool_calls"])
|
|
1355
|
-
|
|
1445
|
+
|
|
1356
1446
|
# Post-process to merge consecutive text blocks
|
|
1357
1447
|
for message in combined_messages:
|
|
1358
1448
|
message["content"] = _merge_consecutive_text(message["content"])
|
|
1359
|
-
|
|
1449
|
+
|
|
1360
1450
|
return combined_messages
|
|
1361
1451
|
|
|
1452
|
+
|
|
1362
1453
|
def _normalize_content(content) -> List[Dict[str, Any]]:
|
|
1363
1454
|
"""Normalize content to list format"""
|
|
1364
1455
|
if isinstance(content, str):
|
|
@@ -1371,28 +1462,28 @@ def _normalize_content(content) -> List[Dict[str, Any]]:
     else:
         return []
 
+
 def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Merge consecutive text blocks with newlines"""
     if not content_list:
         return content_list
-
+
     merged = []
-
+
     for item in content_list:
-        if (item.get("type") == "text" and
-            merged and
-            merged[-1].get("type") == "text"):
+        if item.get("type") == "text" and merged and merged[-1].get("type") == "text":
             # Merge with previous text block
             merged[-1]["text"] += "\n" + item["text"]
         else:
             merged.append(item.copy())
-
+
     return merged
 
+
 @register_agent(models=r".*claude-.*")
 class AnthropicHostedToolsConfig(AsyncAgentConfig):
     """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
-
+
     async def predict_step(
         self,
         messages: Messages,
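Taken together, the two helpers in this hunk normalize a conversation before it is sent to the API: `_combine_completion_messages` folds consecutive same-role messages into one, and `_merge_consecutive_text` joins adjacent text blocks with a newline. A small sketch of the expected effect (input values are hypothetical):

```python
# Hypothetical input: two consecutive "user" messages, one text block each.
messages = [
    {"role": "user", "content": [{"type": "text", "text": "Take a screenshot."}]},
    {"role": "user", "content": [{"type": "text", "text": "Then click the Start button."}]},
]

# Per the definitions above, combining then merging should yield a single user message
# whose adjacent text blocks are joined with "\n":
expected = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Take a screenshot.\nThen click the Start button."}
        ],
    }
]
```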
@@ -1406,21 +1497,21 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Anthropic hosted tools agent loop using liteLLM acompletion.
-
+
         Supports Anthropic's computer use models with hosted tools.
         """
         tools = tools or []
-
+
         # Get tool configuration for this model
         tool_config = _get_tool_config_for_model(model)
-
+
         # Prepare tools for Anthropic API
         anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
-
+
         # Convert responses_items messages to completion format
         completion_messages = _convert_responses_items_to_completion_messages(messages)
         if use_prompt_caching:
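The helpers called here (`_get_tool_config_for_model`, `_prepare_tools_for_anthropic`, `_convert_responses_items_to_completion_messages`) are defined earlier in the file. As used later in this diff, the per-model tool config exposes at least a `tool_version` and a `beta_flag`; the concrete values below are illustrative, not taken from this file:

```python
# Illustrative values only -- the real mapping lives in _get_tool_config_for_model.
tool_config = {
    "tool_version": "computer_20250124",     # becomes the hosted computer tool's "type"
    "beta_flag": "computer-use-2025-01-24",  # sent as the "anthropic-beta" header
}
```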
@@ -1428,7 +1519,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
             completion_messages = _combine_completion_messages(completion_messages)
             # Then add cache control, anthropic requires explicit "cache_control" dicts
             completion_messages = _add_cache_control(completion_messages)
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -1436,80 +1527,74 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
             "tools": anthropic_tools if anthropic_tools else None,
             "stream": stream,
             "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
-
+
         # Add beta header for computer use
         if anthropic_tools:
-            api_kwargs["headers"] = {
-                "anthropic-beta": tool_config["beta_flag"]
-            }
-
+            api_kwargs["headers"] = {"anthropic-beta": tool_config["beta_flag"]}
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Convert response to responses_items format
         responses_items = _convert_completion_to_responses_items(response)
 
         # Extract usage information
-        responses_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+        responses_usage = {
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(responses_usage)
 
         # Return in AsyncAgentConfig format
-        return {
-            "output": responses_items,
-            "usage": responses_usage
-        }
-
+        return {"output": responses_items, "usage": responses_usage}
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Uses Anthropic's computer use models with a custom prompt that instructs
         the agent to only output clicks.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """
         # Get image dimensions from base64 data
         try:
             import base64
-            from PIL import Image
             from io import BytesIO
-
+
+            from PIL import Image
+
             image_data = base64.b64decode(image_b64)
             image = Image.open(BytesIO(image_data))
             display_width, display_height = image.size
         except Exception:
             # Fallback to default dimensions if image parsing fails
             display_width, display_height = 1024, 768
-
+
         # Get tool configuration for this model
         tool_config = _get_tool_config_for_model(model)
-
+
         # Prepare computer tool for Anthropic format
         computer_tool = {
             "type": tool_config["tool_version"],
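Net effect of the rewritten block above: a step now returns both the converted output items and the usage record (including liteLLM's computed response cost) in a single dict. A sketch of the shape, with made-up numbers:

```python
# Illustrative shape of what predict_step returns after this change (values are made up).
step_result = {
    "output": [
        {"type": "computer_call", "action": {"type": "screenshot"}},
    ],
    "usage": {
        "input_tokens": 1200,     # from the responses-usage transform shown above
        "output_tokens": 45,
        "response_cost": 0.0042,  # from response._hidden_params, as in the code above
    },
}
```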
@@ -1522,7 +1607,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
                 },
             },
         }
-
+
         # Construct messages in OpenAI chat completion format for liteLLM
         messages = [
             {
@@ -1541,18 +1626,16 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
 7. Be decisive and action-oriented. Complete the requested task fully.
 
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
                     },
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{image_b64}"
-                        }
-                    }
-                ]
+                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                    },
+                ],
             }
         ]
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -1560,32 +1643,36 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "tools": [computer_tool],
             "stream": False,
             "max_tokens": 100,  # Keep response short for click prediction
-            "headers": {
-                "anthropic-beta": tool_config["beta_flag"]
-            }
+            "headers": {"anthropic-beta": tool_config["beta_flag"]},
         }
-
+        # Thread optional API params
+        if "api_key" in kwargs and kwargs.get("api_key") is not None:
+            api_kwargs["api_key"] = kwargs.get("api_key")
+        if "api_base" in kwargs and kwargs.get("api_base") is not None:
+            api_kwargs["api_base"] = kwargs.get("api_base")
+
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Convert response to responses_items format to extract click coordinates
         responses_items = _convert_completion_to_responses_items(response)
-
+
         # Look for computer_call with click action
         for item in responses_items:
-            if (
-                item
-
-
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                 action = item["action"]
-                if action.get("
+                if action.get("x") and action.get("y"):
                     x = action.get("x")
                     y = action.get("y")
-
-
-
+                    return (int(x), int(y))
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click", "step"]