cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/loops/anthropic.py
CHANGED
@@ -4,30 +4,33 @@ Anthropic hosted tools agent loop implementation using liteLLM
 import asyncio
 import json
-from typing import
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-from litellm.responses.litellm_completion_transformation.transformation import
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    make_reasoning_item,
-    make_output_text_item,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_failed_tool_call_items,
+    make_input_image_item,
     make_keypress_item,
+    make_left_mouse_down_item,
+    make_left_mouse_up_item,
     make_move_item,
+    make_output_text_item,
+    make_reasoning_item,
+    make_screenshot_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item,
-    make_screenshot_item,
-    make_failed_tool_call_items,
-    make_left_mouse_down_item,
-    make_left_mouse_up_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 # Model version mapping to tool version and beta flag
 MODEL_TOOL_MAPPING = [
@@ -35,38 +38,34 @@ MODEL_TOOL_MAPPING = [
     {
         "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
         "tool_version": "computer_20250124",
-        "beta_flag": "computer-use-2025-01-24"
+        "beta_flag": "computer-use-2025-01-24",
     },
     # Claude 3.7 models
     {
         "pattern": r"claude-3\.?7|claude-3-7",
         "tool_version": "computer_20250124",
-        "beta_flag": "computer-use-2025-01-24"
+        "beta_flag": "computer-use-2025-01-24",
     },
     # Claude 3.5 models (fallback)
     {
         "pattern": r"claude-3\.?5|claude-3-5",
         "tool_version": "computer_20241022",
-        "beta_flag": "computer-use-2024-10-22"
-    }
+        "beta_flag": "computer-use-2024-10-22",
+    },
 ]
 
+
 def _get_tool_config_for_model(model: str) -> Dict[str, str]:
     """Get tool version and beta flag for the given model."""
     import re
 
     for mapping in MODEL_TOOL_MAPPING:
         if re.search(mapping["pattern"], model, re.IGNORECASE):
-            return {
-                "beta_flag": mapping["beta_flag"]
-            }
+            return {"tool_version": mapping["tool_version"], "beta_flag": mapping["beta_flag"]}
+
     # Default to Claude 3.5 configuration
-    return {
-        "beta_flag": "computer-use-2024-10-22"
-    }
+    return {"tool_version": "computer_20241022", "beta_flag": "computer-use-2024-10-22"}
+
 
 async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
     """Map a computer tool to Anthropic's hosted tool schema."""
@@ -76,7 +75,7 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
-
+
@@ -89,32 +88,37 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
+
-
+
-            anthropic_tools.append(
+            anthropic_tools.append(
+                await _map_computer_tool_to_anthropic(
+                    schema["computer"], tool_config["tool_version"]
+                )
+            )
-            anthropic_tools.append(
+            anthropic_tools.append(
+                {
+                    "type": "function",
+                    "function": {
+                        "name": function_schema["name"],
+                        "description": function_schema.get("description", ""),
+                        "parameters": function_schema.get("parameters", {}),
+                    },
                 }
+            )
+
     return anthropic_tools
 
+
@@ -123,7 +127,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -135,51 +139,38 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                            converted_content.append(
-                                "type": "image_url",
-                                    "url": image_url
-                                }
-                            })
+                            converted_content.append(
+                                {"type": "image_url", "image_url": {"url": image_url}}
+                            )
-                        converted_content.append({
-                            "type": "text",
-                            "text": text
-                        })
+                        converted_content.append({"type": "text", "text": text})
-                completion_messages.append(
-                    "role": "user",
-                })
+
+                completion_messages.append(
+                    {"role": "user", "content": converted_content if converted_content else content}
+                )
-                completion_messages.append({
-                    "content": content
-                })
+                completion_messages.append({"role": "user", "content": content})
+
-            content = [{
+                content = [{"type": "output_text", "text": content}]
+
-            completion_messages.append({
-                "content": content
-            })
+            completion_messages.append({"role": "assistant", "content": content})
+
@@ -189,58 +180,54 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                completion_messages.append({
-                    "content": reasoning_text
-                })
+                completion_messages.append({"role": "assistant", "content": reasoning_text})
+
-            openai_tool_calls = [
-                    "name": fn_name,
-                    "arguments": fn_args
+            openai_tool_calls = [
+                {
+                    "id": call_id,
+                    "type": "function",
+                    "function": {"name": fn_name, "arguments": fn_args},
                 }
+            ]  # If the last completion message is an assistant message, extend the tool_calls
-                completion_messages.append(
-                    "role": "assistant",
-                })
+                completion_messages.append(
+                    {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
+                )
+
-            completion_messages.append(
+            completion_messages.append(
+                {
+                    "role": "function",
+                    "name": fn_name,
+                    "tool_call_id": call_id,
+                    "content": str(fn_output),
+                }
+            )
+
-
+
-
+
@@ -253,7 +240,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -267,16 +254,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                action_name =
+                action_name = (
+                    "right_click"
+                    if button == "right"
+                    else "middle_click" if button == "wheel" else "left_click"
+                )
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": action_name,
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                        },
                     }
+                )
@@ -288,7 +281,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -301,15 +294,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                tool_use_content.append(
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "double_click",
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                        },
                     }
+                )
@@ -320,7 +315,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -333,15 +328,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                tool_use_content.append(
-                            "action": "type",
-                            "text": action.get("text", "")
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "type", "text": action.get("text", "")},
                     }
+                )
@@ -352,7 +346,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -365,15 +359,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                tool_use_content.append(
-                            "action": "key",
-                            "text": "+".join(action.get("keys", []))
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "key", "text": "+".join(action.get("keys", []))},
                     }
+                )
@@ -385,7 +378,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -398,15 +391,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                tool_use_content.append(
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "mouse_move",
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                        },
                     }
+                )
@@ -420,7 +415,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -453,18 +448,20 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                tool_use_content.append(
+
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "scroll",
+                            "coordinate": [action.get("x", 0), action.get("y", 0)],
+                            "scroll_direction": direction,
+                            "scroll_amount": amount,
+                        },
                     }
+                )
@@ -478,7 +475,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -498,17 +495,19 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                    tool_use_content.append(
+
+                    tool_use_content.append(
+                        {
+                            "type": "tool_use",
+                            "id": call_id,
+                            "name": "computer",
+                            "input": {
+                                "action": "left_click_drag",
+                                "start_coordinate": start_coord,
+                                "end_coordinate": end_coord,
+                            },
                         }
+                    )
@@ -518,7 +517,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -530,14 +529,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                tool_use_content.append(
-                            "action": "wait"
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "wait"},
                     }
+                )
@@ -547,7 +546,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-
+
@@ -559,47 +558,53 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                tool_use_content.append(
-                            "action": "screenshot"
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {"action": "screenshot"},
                     }
+                )
             elif action_type == "left_mouse_down":
-                tool_use_content.append(
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "left_mouse_down",
+                            "coordinate": [action.get("x", None), action.get("y", None)],
+                        },
                     }
+                )
             elif action_type == "left_mouse_up":
-                tool_use_content.append(
+                tool_use_content.append(
+                    {
+                        "type": "tool_use",
+                        "id": call_id,
+                        "name": "computer",
+                        "input": {
+                            "action": "left_mouse_up",
+                            "coordinate": [action.get("x", None), action.get("y", None)],
+                        },
                     }
+                )
+
             # Convert tool_use_content to OpenAI tool_calls format
             openai_tool_calls = []
             for tool_use in tool_use_content:
-                openai_tool_calls.append(
+                openai_tool_calls.append(
+                    {
+                        "id": tool_use["id"],
+                        "type": "function",
+                        "function": {
+                            "name": tool_use["name"],
+                            "arguments": json.dumps(tool_use["input"]),
+                        },
                     }
+                )
+
@@ -607,54 +612,52 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
-                completion_messages.append(
-                    "role": "assistant",
-                })
+                completion_messages.append(
+                    {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
+                )
+
-
+
-                completion_messages.append(
-                        "type": "image_url",
-                    }
-                }]
-                })
+                completion_messages.append(
+                    {
+                        "role": "function",
+                        "name": "computer",
+                        "tool_call_id": call_id,
+                        "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                    }
+                )
-                completion_messages.append(
+                completion_messages.append(
+                    {
+                        "role": "function",
+                        "name": "computer",
+                        "tool_call_id": call_id,
+                        "content": str(output),
+                    }
+                )
+
     return completion_messages
 
+
 def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
     """Convert liteLLM completion response to responses_items message format."""
     responses_items = []
-
+
-    if not response or not hasattr(response,
+    if not response or not hasattr(response, "choices") or not response.choices:
         return responses_items
-
+
     # Handle text content
-    if hasattr(message,
+    if hasattr(message, "content") and message.content:
@@ -667,31 +670,36 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                call_id=call_id,
+                            )
+                        )
-                        responses_items.append(
-                            text=tool_input.get("text", ""),
-                        ))
+                        responses_items.append(
+                            make_type_item(text=tool_input.get("text", ""), call_id=call_id)
+                        )
-                        responses_items.append(
+                        responses_items.append(
+                            make_keypress_item(
+                                keys=tool_input.get("text", "")
+                                .replace("+", "-")
+                                .split("-"),
+                                call_id=call_id,
+                            )
+                        )
@@ -699,64 +707,88 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
                             make_move_item(
                                 x=coordinate[0] if len(coordinate) > 0 else 0,
                                 y=coordinate[1] if len(coordinate) > 1 else 0,
-                                call_id=call_id
+                                call_id=call_id,
                             )
                         )
-                        scroll_x =
+                        scroll_x = (
+                            scroll_amount
+                            if tool_input.get("scroll_direction", "down") == "right"
+                            else (
+                                -scroll_amount
+                                if tool_input.get("scroll_direction", "down") == "left"
+                                else 0
+                            )
+                        )
+                        scroll_y = (
+                            scroll_amount
+                            if tool_input.get("scroll_direction", "down") == "down"
+                            else (
+                                -scroll_amount
+                                if tool_input.get("scroll_direction", "down") == "up"
+                                else 0
+                            )
+                        )
+                        responses_items.append(
+                            make_scroll_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                scroll_x=scroll_x,
+                                scroll_y=scroll_y,
+                                call_id=call_id,
+                            )
+                        )
-                        responses_items.append(
+                        responses_items.append(
+                            make_drag_item(
+                                path=[
+                                    {
+                                        "x": start_coord[0] if len(start_coord) > 0 else 0,
+                                        "y": start_coord[1] if len(start_coord) > 1 else 0,
+                                    },
+                                    {
+                                        "x": end_coord[0] if len(end_coord) > 0 else 0,
+                                        "y": end_coord[1] if len(end_coord) > 1 else 0,
+                                    },
+                                ],
+                                call_id=call_id,
+                            )
+                        )
-                        responses_items.append(
+                        responses_items.append(
+                            make_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                button="right",
+                                call_id=call_id,
+                            )
+                        )
-                        responses_items.append(
+                        responses_items.append(
+                            make_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                button="wheel",
+                                call_id=call_id,
+                            )
+                        )
-                        responses_items.append(
+                        responses_items.append(
+                            make_double_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                call_id=call_id,
+                            )
+                        )
@@ -782,11 +814,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_left_mouse_down_item(
+                                x=coordinate[0] if len(coordinate) > 0 else None,
+                                y=coordinate[1] if len(coordinate) > 1 else None,
+                                call_id=call_id,
+                            )
+                        )
@@ -800,11 +834,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_left_mouse_up_item(
+                                x=coordinate[0] if len(coordinate) > 0 else None,
+                                y=coordinate[1] if len(coordinate) > 1 else None,
+                                call_id=call_id,
+                            )
+                        )
@@ -816,21 +852,21 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(make_wait_item(
-                            call_id=call_id
-                        ))
+                        responses_items.append(make_wait_item(call_id=call_id))
-                        responses_items.extend(
+                        responses_items.extend(
+                            make_failed_tool_call_items(
+                                tool_name="computer",
+                                tool_kwargs=tool_input,
+                                error_message=repr(e),
+                                call_id=call_id,
+                            )
+                        )
+
     # Handle tool calls (alternative format)
-    if hasattr(message,
+    if hasattr(message, "tool_calls") and message.tool_calls:
@@ -852,7 +888,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -861,9 +897,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(make_screenshot_item(
-                            call_id=call_id
-                        ))
+                        responses_items.append(make_screenshot_item(call_id=call_id))
@@ -877,7 +911,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -889,11 +923,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                call_id=call_id,
+                            )
+                        )
@@ -907,7 +943,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -917,10 +953,9 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
-                            text=args.get("text", ""),
-                        ))
+                        responses_items.append(
+                            make_type_item(text=args.get("text", ""), call_id=call_id)
+                        )
@@ -934,7 +969,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -944,10 +979,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_keypress_item(
+                                keys=args.get("text", "").replace("+", "-").split("-"),
+                                call_id=call_id,
+                            )
+                        )
@@ -961,7 +998,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -973,12 +1010,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_move_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                call_id=call_id,
+                            )
+                        )
+
@@ -995,7 +1034,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1011,17 +1050,25 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        scroll_x =
+                        scroll_x = (
+                            amount
+                            if direction == "left"
+                            else -amount if direction == "right" else 0
+                        )
+                        scroll_y = (
+                            amount
+                            if direction == "up"
+                            else -amount if direction == "down" else 0
+                        )
+                        responses_items.append(
+                            make_scroll_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                scroll_x=scroll_x,
+                                scroll_y=scroll_y,
+                                call_id=call_id,
+                            )
+                        )
@@ -1036,7 +1083,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1051,19 +1098,21 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_drag_item(
+                                path=[
+                                    {
+                                        "x": start_coord[0] if len(start_coord) > 0 else 0,
+                                        "y": start_coord[1] if len(start_coord) > 1 else 0,
+                                    },
+                                    {
+                                        "x": end_coord[0] if len(end_coord) > 0 else 0,
+                                        "y": end_coord[1] if len(end_coord) > 1 else 0,
+                                    },
+                                ],
+                                call_id=call_id,
+                            )
+                        )
@@ -1077,7 +1126,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1090,12 +1139,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                button="right",
+                                call_id=call_id,
+                            )
+                        )
@@ -1109,7 +1160,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1122,12 +1173,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                button="wheel",
+                                call_id=call_id,
+                            )
+                        )
@@ -1141,7 +1194,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1153,11 +1206,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_double_click_item(
+                                x=coordinate[0] if len(coordinate) > 0 else 0,
+                                y=coordinate[1] if len(coordinate) > 1 else 0,
+                                call_id=call_id,
+                            )
+                        )
@@ -1171,7 +1226,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1196,7 +1251,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1209,11 +1264,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_left_mouse_down_item(
+                                x=coordinate[0] if len(coordinate) > 0 else None,
+                                y=coordinate[1] if len(coordinate) > 1 else None,
+                                call_id=call_id,
+                            )
+                        )
@@ -1227,7 +1284,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1240,11 +1297,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(
+                        responses_items.append(
+                            make_left_mouse_up_item(
+                                x=coordinate[0] if len(coordinate) > 0 else None,
+                                y=coordinate[1] if len(coordinate) > 1 else None,
+                                call_id=call_id,
+                            )
+                        )
@@ -1258,7 +1317,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1281,7 +1340,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-
+
@@ -1290,74 +1349,77 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
-                        responses_items.append(make_wait_item(
-                            call_id=call_id
-                        ))
+                        responses_items.append(make_wait_item(call_id=call_id))
-                        responses_items.extend(
+                        responses_items.extend(
+                            make_failed_tool_call_items(
+                                tool_name="computer",
+                                tool_kwargs=args,
+                                error_message=repr(e),
+                                call_id=call_id,
+                            )
+                        )
     return responses_items
 
+
 def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Add cache control to completion messages"""
     num_writes = 0
     for message in completion_messages:
-        message["cache_control"] = {
+        message["cache_control"] = {"type": "ephemeral"}
     return completion_messages
 
+
 def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Combine completion messages with the same role"""
@@ -1370,28 +1432,28 @@ def _normalize_content(content) -> List[Dict[str, Any]]:
+
 def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Merge consecutive text blocks with newlines"""
     for item in content_list:
-        if
-            merged and
-            merged[-1].get("type") == "text"):
+        if item.get("type") == "text" and merged and merged[-1].get("type") == "text":
     return merged
 
+
 @register_agent(models=r".*claude-.*")
 class AnthropicHostedToolsConfig(AsyncAgentConfig):
@@ -1405,21 +1467,21 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
@@ -1427,7 +1489,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
-
+
@@ -1435,80 +1497,74 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
            "stream": stream,
            "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
         # Add beta header for computer use
         if anthropic_tools:
-            api_kwargs["headers"] = {
-            }
+            api_kwargs["headers"] = {"anthropic-beta": tool_config["beta_flag"]}
+
         # Extract usage information
-        responses_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+        responses_usage = {
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         # Return in AsyncAgentConfig format
-        return {
-            "usage": responses_usage
-        }
+        return {"output": responses_items, "usage": responses_usage}
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         try:
             import base64
-            from PIL import Image
             from io import BytesIO
+
+            from PIL import Image
+
@@ -1521,7 +1577,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
-
+
@@ -1540,18 +1596,16 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
                 },
                 {
                     "type": "image_url",
-                    "image_url": {
-                ]
+                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                },
+            ],
             }
         ]
@@ -1559,31 +1613,31 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "tools": [computer_tool],
             "stream": False,
             "max_tokens": 100,  # Keep response short for click prediction
-            "headers": {
-                "anthropic-beta": tool_config["beta_flag"]
-            }
+            "headers": {"anthropic-beta": tool_config["beta_flag"]},
         }
         # Look for computer_call with click action
         for item in responses_items:
-            if (
-                item
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
                 action = item["action"]
                 if action.get("x") and action.get("y"):
                     x = action.get("x")
                     y = action.get("y")
                     return (int(x), int(y))
         return None
 
     def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click", "step"]