cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/responses.py
CHANGED
|
@@ -6,10 +6,10 @@ Based on the OpenAI spec for Responses API items.
|
|
|
6
6
|
import base64
|
|
7
7
|
import json
|
|
8
8
|
import uuid
|
|
9
|
-
from typing import
|
|
9
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
10
10
|
|
|
11
|
+
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
|
|
11
12
|
from openai.types.responses.response_computer_tool_call_param import (
|
|
12
|
-
ResponseComputerToolCallParam,
|
|
13
13
|
ActionClick,
|
|
14
14
|
ActionDoubleClick,
|
|
15
15
|
ActionDrag,
|
|
@@ -18,224 +18,222 @@ from openai.types.responses.response_computer_tool_call_param import (
|
|
|
18
18
|
ActionMove,
|
|
19
19
|
ActionScreenshot,
|
|
20
20
|
ActionScroll,
|
|
21
|
+
)
|
|
22
|
+
from openai.types.responses.response_computer_tool_call_param import (
|
|
21
23
|
ActionType as ActionTypeAction,
|
|
24
|
+
)
|
|
25
|
+
from openai.types.responses.response_computer_tool_call_param import (
|
|
22
26
|
ActionWait,
|
|
23
|
-
PendingSafetyCheck
|
|
27
|
+
PendingSafetyCheck,
|
|
28
|
+
ResponseComputerToolCallParam,
|
|
29
|
+
)
|
|
30
|
+
from openai.types.responses.response_function_tool_call_param import (
|
|
31
|
+
ResponseFunctionToolCallParam,
|
|
24
32
|
)
|
|
25
|
-
|
|
26
|
-
from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
|
|
27
|
-
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
|
|
28
|
-
from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
|
|
29
|
-
from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
|
|
30
|
-
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
|
|
31
33
|
from openai.types.responses.response_input_image_param import ResponseInputImageParam
|
|
34
|
+
from openai.types.responses.response_output_message_param import (
|
|
35
|
+
ResponseOutputMessageParam,
|
|
36
|
+
)
|
|
37
|
+
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
|
|
38
|
+
from openai.types.responses.response_reasoning_item_param import (
|
|
39
|
+
ResponseReasoningItemParam,
|
|
40
|
+
Summary,
|
|
41
|
+
)
|
|
42
|
+
|
|
32
43
|
|
|
33
44
|
def random_id():
    """Return a new random identifier: a version-4 UUID rendered as a string."""
    generated = uuid.uuid4()
    return str(generated)
|
|
35
46
|
|
|
47
|
+
|
|
36
48
|
# User message items
|
|
37
49
|
def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
    """Wrap an image in a user ``message`` item holding one ``input_image`` part.

    Args:
        image_data: One of
            * raw image bytes, which are base64-encoded here;
            * a base64 string, which is used as-is;
            * a ready-made ``data:`` or ``http(s)://`` URL, which is passed
              through unchanged instead of being prefixed a second time.

    Returns:
        A user message whose content is a single ``input_image`` entry. Bytes
        and bare base64 input are labelled ``image/png`` in the data URL; the
        payload itself is not inspected.
    """
    if isinstance(image_data, bytes):
        image_url = "data:image/png;base64," + base64.b64encode(image_data).decode("utf-8")
    elif isinstance(image_data, str) and image_data.startswith(("data:", "http://", "https://")):
        # Already a usable URL. A base64 payload can never start with these
        # prefixes (":" is not in the base64 alphabet), so this branch cannot
        # change the result for inputs that worked before.
        image_url = image_data
    else:
        image_url = f"data:image/png;base64,{image_data}"

    return EasyInputMessageParam(
        content=[
            ResponseInputImageParam(
                type="input_image",
                image_url=image_url,
            )  # type: ignore
        ],
        role="user",
        type="message",
    )
|
|
48
60
|
|
|
61
|
+
|
|
49
62
|
# Text items
|
|
50
63
|
def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
    """Build a ``reasoning`` item whose summary is a single ``summary_text`` entry.

    Args:
        reasoning: Text stored as the only summary entry.

    Returns:
        A reasoning item carrying a freshly generated ``id``.
    """
    item_id = random_id()
    summary_entry = Summary(text=reasoning, type="summary_text")
    return ResponseReasoningItemParam(id=item_id, summary=[summary_entry], type="reasoning")
|
|
58
67
|
|
|
68
|
+
|
|
59
69
|
def make_output_text_item(content: str) -> ResponseOutputMessageParam:
    """Build a completed assistant ``message`` item with one ``output_text`` part.

    Args:
        content: Text of the assistant message.

    Returns:
        An assistant message with a freshly generated ``id``, status
        ``"completed"`` and an empty annotations list on its text part.
    """
    item_id = random_id()
    text_part = ResponseOutputTextParam(text=content, type="output_text", annotations=[])
    return ResponseOutputMessageParam(
        id=item_id,
        content=[text_part],
        role="assistant",
        status="completed",
        type="message",
    )
|
|
73
77
|
|
|
78
|
+
|
|
74
79
|
# Function call items
|
|
75
|
-
def make_function_call_item(
    function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
) -> ResponseFunctionToolCallParam:
    """Build a completed ``function_call`` item.

    Args:
        function_name: Name of the function being called.
        arguments: Call arguments; serialized to a JSON string in the item.
        call_id: Identifier linking the call to its output. A random id is
            generated when it is omitted or falsy.

    Returns:
        A function call item with a freshly generated ``id`` and status
        ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    serialized_arguments = json.dumps(arguments)
    return ResponseFunctionToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        name=function_name,
        arguments=serialized_arguments,
        status="completed",
        type="function_call",
    )
|
|
84
91
|
|
|
92
|
+
|
|
85
93
|
# Computer tool call items
|
|
86
|
-
def make_click_item(
    x: int,
    y: int,
    button: Literal["left", "right", "wheel", "back", "forward"] = "left",
    call_id: Optional[str] = None,
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``click`` action.

    Args:
        x: X coordinate of the click.
        y: Y coordinate of the click.
        button: Mouse button to press; defaults to ``"left"``.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    click_action = ActionClick(button=button, type="click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
100
108
|
|
|
101
|
-
|
|
109
|
+
|
|
110
|
+
def make_double_click_item(
    x: int, y: int, call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``double_click`` action.

    Args:
        x: X coordinate of the double click.
        y: Y coordinate of the double click.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    double_click_action = ActionDoubleClick(type="double_click", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=double_click_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
114
121
|
|
|
115
|
-
|
|
122
|
+
|
|
123
|
+
def make_drag_item(
    path: List[Dict[str, int]], call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``drag`` action.

    Args:
        path: Points of the drag, each a mapping with ``"x"`` and ``"y"`` keys.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.

    Raises:
        KeyError: If a point in ``path`` lacks an ``"x"`` or ``"y"`` key.
    """
    # Convert every plain point mapping into the typed path entry first, so a
    # malformed point fails before any id is generated.
    waypoints = []
    for point in path:
        waypoints.append(ActionDragPath(x=point["x"], y=point["y"]))

    item_id = random_id()
    resolved_call_id = call_id or random_id()
    drag_action = ActionDrag(path=waypoints, type="drag")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=drag_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
128
135
|
|
|
129
|
-
|
|
136
|
+
|
|
137
|
+
def make_keypress_item(
    keys: List[str], call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``keypress`` action.

    Args:
        keys: Keys to press, passed through unchanged to the action.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    keypress_action = ActionKeypress(keys=keys, type="keypress")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=keypress_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
141
148
|
|
|
149
|
+
|
|
142
150
|
def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``move`` action.

    Args:
        x: X coordinate to move to.
        y: Y coordinate to move to.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    move_action = ActionMove(type="move", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=move_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
155
159
|
|
|
160
|
+
|
|
156
161
|
def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``screenshot`` action.

    Args:
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    screenshot_action = ActionScreenshot(type="screenshot")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=screenshot_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
167
170
|
|
|
168
|
-
|
|
171
|
+
|
|
172
|
+
def make_scroll_item(
    x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``scroll`` action.

    Args:
        x: X coordinate stored on the action.
        y: Y coordinate stored on the action.
        scroll_x: Horizontal scroll amount stored on the action.
        scroll_y: Vertical scroll amount stored on the action.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    scroll_action = ActionScroll(scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y)
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=scroll_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
183
183
|
|
|
184
|
+
|
|
184
185
|
def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``type`` action.

    Args:
        text: Text stored on the action.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    type_action = ActionTypeAction(text=text, type="type")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=type_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
196
194
|
|
|
195
|
+
|
|
197
196
|
def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
    """Build a completed ``computer_call`` item for a ``wait`` action.

    Args:
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A computer call item with a freshly generated ``id``, no pending
        safety checks and status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    wait_action = ActionWait(type="wait")
    return ResponseComputerToolCallParam(
        id=item_id,
        call_id=resolved_call_id,
        action=wait_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
208
205
|
|
|
206
|
+
|
|
209
207
|
# Extra anthropic computer calls
|
|
210
|
-
def make_left_mouse_down_item(
    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
) -> Dict[str, Any]:
    """Build a completed ``computer_call`` item for a ``left_mouse_down`` action.

    The item is returned as a plain dict rather than a typed call param.

    Args:
        x: X coordinate stored on the action; ``None`` is stored as-is.
        y: Y coordinate stored on the action; ``None`` is stored as-is.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A dict with a freshly generated ``id``, no pending safety checks and
        status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    mouse_down_action = dict(type="left_mouse_down", x=x, y=y)
    return dict(
        id=item_id,
        call_id=resolved_call_id,
        action=mouse_down_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
223
219
|
|
|
224
|
-
|
|
220
|
+
|
|
221
|
+
def make_left_mouse_up_item(
    x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
) -> Dict[str, Any]:
    """Build a completed ``computer_call`` item for a ``left_mouse_up`` action.

    The item is returned as a plain dict rather than a typed call param.

    Args:
        x: X coordinate stored on the action; ``None`` is stored as-is.
        y: Y coordinate stored on the action; ``None`` is stored as-is.
        call_id: Identifier of the call. A random id is generated when it is
            omitted or falsy.

    Returns:
        A dict with a freshly generated ``id``, no pending safety checks and
        status ``"completed"``.
    """
    item_id = random_id()
    resolved_call_id = call_id or random_id()
    mouse_up_action = dict(type="left_mouse_up", x=x, y=y)
    return dict(
        id=item_id,
        call_id=resolved_call_id,
        action=mouse_up_action,
        pending_safety_checks=[],
        status="completed",
        type="computer_call",
    )
|
|
237
232
|
|
|
238
|
-
|
|
233
|
+
|
|
234
|
+
def make_failed_tool_call_items(
|
|
235
|
+
tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
|
|
236
|
+
) -> List[Dict[str, Any]]:
|
|
239
237
|
call_id = call_id if call_id else random_id()
|
|
240
238
|
return [
|
|
241
239
|
{
|
|
@@ -249,27 +247,80 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
|
|
|
249
247
|
"type": "function_call_output",
|
|
250
248
|
"call_id": call_id,
|
|
251
249
|
"output": json.dumps({"error": error_message}),
|
|
252
|
-
}
|
|
250
|
+
},
|
|
253
251
|
]
|
|
254
252
|
|
|
253
|
+
|
|
254
|
+
def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
    """Build a ``function_call_output`` item that reports a tool error.

    Args:
        error_message: Description of the failure; stored under the ``"error"``
            key of the JSON-encoded ``output`` field.
        call_id: Identifier of the call this output answers. A random id is
            generated when it is omitted or falsy.

    Returns:
        A dict with ``type``, ``call_id`` and ``output`` keys, in that order.
    """
    resolved_call_id = call_id or random_id()
    error_payload = json.dumps({"error": error_message})
    return dict(
        type="function_call_output",
        call_id=resolved_call_id,
        output=error_payload,
    )
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def replace_failed_computer_calls_with_function_calls(
    messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Rewrite failed ``computer_call`` items as ``function_call`` items.

    A ``computer_call`` counts as failed when a ``function_call_output`` item
    in ``messages`` carries the same non-empty ``call_id``. Such a call is
    treated as a function call instead, because ``computer_call_output`` items
    do not support text output.

    Args:
        messages: List of message items to process. The list passed in is not
            modified.

    Returns:
        A new list in the same order. Each failed ``computer_call`` is replaced
        by a ``function_call`` named ``"computer"`` that keeps the original
        ``call_id``, reuses the original ``id`` (or a random one when absent)
        and carries the JSON-encoded action as its ``arguments``. All other
        items are the same objects as in the input.
    """
    # Every call_id answered by a function_call_output marks a failed call.
    failed_call_ids = set()
    for candidate in messages:
        if candidate.get("type") != "function_call_output":
            continue
        answered_id = candidate.get("call_id")
        if answered_id:
            failed_call_ids.add(answered_id)

    def _is_failed_computer_call(item: Dict[str, Any]) -> bool:
        return item.get("type") == "computer_call" and item.get("call_id") in failed_call_ids

    def _as_function_call(item: Dict[str, Any]) -> Dict[str, Any]:
        return {
            "type": "function_call",
            "id": item.get("id", random_id()),
            "call_id": item.get("call_id"),
            "name": "computer",
            "arguments": json.dumps(item.get("action", {})),
        }

    return [
        _as_function_call(item) if _is_failed_computer_call(item) else item
        for item in messages
    ]
|
|
302
|
+
|
|
303
|
+
|
|
255
304
|
# Conversion functions between element descriptions and coordinates
|
|
256
|
-
def convert_computer_calls_desc2xy(
|
|
305
|
+
def convert_computer_calls_desc2xy(
|
|
306
|
+
responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
|
|
307
|
+
) -> List[Dict[str, Any]]:
|
|
257
308
|
"""
|
|
258
309
|
Convert computer calls from element descriptions to x,y coordinates.
|
|
259
|
-
|
|
310
|
+
|
|
260
311
|
Args:
|
|
261
312
|
responses_items: List of response items containing computer calls with element_description
|
|
262
313
|
desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
|
|
263
|
-
|
|
314
|
+
|
|
264
315
|
Returns:
|
|
265
316
|
List of response items with element_description replaced by x,y coordinates
|
|
266
317
|
"""
|
|
267
318
|
converted_items = []
|
|
268
|
-
|
|
319
|
+
|
|
269
320
|
for item in responses_items:
|
|
270
321
|
if item.get("type") == "computer_call" and "action" in item:
|
|
271
322
|
action = item["action"].copy()
|
|
272
|
-
|
|
323
|
+
|
|
273
324
|
# Handle single element_description
|
|
274
325
|
if "element_description" in action:
|
|
275
326
|
desc = action["element_description"]
|
|
@@ -278,48 +329,50 @@ def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2x
|
|
|
278
329
|
action["x"] = x
|
|
279
330
|
action["y"] = y
|
|
280
331
|
del action["element_description"]
|
|
281
|
-
|
|
332
|
+
|
|
282
333
|
# Handle start_element_description and end_element_description for drag operations
|
|
283
334
|
elif "start_element_description" in action and "end_element_description" in action:
|
|
284
335
|
start_desc = action["start_element_description"]
|
|
285
336
|
end_desc = action["end_element_description"]
|
|
286
|
-
|
|
337
|
+
|
|
287
338
|
if start_desc in desc2xy and end_desc in desc2xy:
|
|
288
339
|
start_x, start_y = desc2xy[start_desc]
|
|
289
340
|
end_x, end_y = desc2xy[end_desc]
|
|
290
341
|
action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
|
|
291
342
|
del action["start_element_description"]
|
|
292
343
|
del action["end_element_description"]
|
|
293
|
-
|
|
344
|
+
|
|
294
345
|
converted_item = item.copy()
|
|
295
346
|
converted_item["action"] = action
|
|
296
347
|
converted_items.append(converted_item)
|
|
297
348
|
else:
|
|
298
349
|
converted_items.append(item)
|
|
299
|
-
|
|
350
|
+
|
|
300
351
|
return converted_items
|
|
301
352
|
|
|
302
353
|
|
|
303
|
-
def convert_computer_calls_xy2desc(
|
|
354
|
+
def convert_computer_calls_xy2desc(
|
|
355
|
+
responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
|
|
356
|
+
) -> List[Dict[str, Any]]:
|
|
304
357
|
"""
|
|
305
358
|
Convert computer calls from x,y coordinates to element descriptions.
|
|
306
|
-
|
|
359
|
+
|
|
307
360
|
Args:
|
|
308
361
|
responses_items: List of response items containing computer calls with x,y coordinates
|
|
309
362
|
desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
|
|
310
|
-
|
|
363
|
+
|
|
311
364
|
Returns:
|
|
312
365
|
List of response items with x,y coordinates replaced by element_description
|
|
313
366
|
"""
|
|
314
367
|
# Create reverse mapping from coordinates to descriptions
|
|
315
368
|
xy2desc = {coords: desc for desc, coords in desc2xy.items()}
|
|
316
|
-
|
|
369
|
+
|
|
317
370
|
converted_items = []
|
|
318
|
-
|
|
371
|
+
|
|
319
372
|
for item in responses_items:
|
|
320
373
|
if item.get("type") == "computer_call" and "action" in item:
|
|
321
374
|
action = item["action"].copy()
|
|
322
|
-
|
|
375
|
+
|
|
323
376
|
# Handle single x,y coordinates
|
|
324
377
|
if "x" in action and "y" in action:
|
|
325
378
|
coords = (action["x"], action["y"])
|
|
@@ -327,77 +380,94 @@ def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2x
|
|
|
327
380
|
action["element_description"] = xy2desc[coords]
|
|
328
381
|
del action["x"]
|
|
329
382
|
del action["y"]
|
|
330
|
-
|
|
383
|
+
|
|
331
384
|
# Handle path for drag operations
|
|
332
385
|
elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
|
|
333
386
|
start_point = action["path"][0]
|
|
334
387
|
end_point = action["path"][1]
|
|
335
|
-
|
|
336
|
-
if (
|
|
337
|
-
"x" in
|
|
338
|
-
|
|
388
|
+
|
|
389
|
+
if (
|
|
390
|
+
"x" in start_point
|
|
391
|
+
and "y" in start_point
|
|
392
|
+
and "x" in end_point
|
|
393
|
+
and "y" in end_point
|
|
394
|
+
):
|
|
395
|
+
|
|
339
396
|
start_coords = (start_point["x"], start_point["y"])
|
|
340
397
|
end_coords = (end_point["x"], end_point["y"])
|
|
341
|
-
|
|
398
|
+
|
|
342
399
|
if start_coords in xy2desc and end_coords in xy2desc:
|
|
343
400
|
action["start_element_description"] = xy2desc[start_coords]
|
|
344
401
|
action["end_element_description"] = xy2desc[end_coords]
|
|
345
402
|
del action["path"]
|
|
346
|
-
|
|
403
|
+
|
|
347
404
|
converted_item = item.copy()
|
|
348
405
|
converted_item["action"] = action
|
|
349
406
|
converted_items.append(converted_item)
|
|
350
407
|
else:
|
|
351
408
|
converted_items.append(item)
|
|
352
|
-
|
|
409
|
+
|
|
353
410
|
return converted_items
|
|
354
411
|
|
|
355
412
|
|
|
356
413
|
def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
    """
    Extract all element descriptions from computer calls in responses items.

    Only items whose ``type`` is ``"computer_call"`` and that have an
    ``action`` are inspected. From each action the values of
    ``element_description``, ``start_element_description`` and
    ``end_element_description`` are collected when present.

    Args:
        responses_items: List of response items containing computer calls

    Returns:
        List of unique element descriptions found in computer calls, in the
        order they are first encountered. The order is deterministic, unlike
        converting a set of strings to a list, which varies between runs.
    """
    description_keys = (
        "element_description",
        "start_element_description",
        "end_element_description",
    )

    # A dict keeps insertion order, so it serves as an ordered set here.
    seen: Dict[str, None] = {}

    for item in responses_items:
        if item.get("type") != "computer_call" or "action" not in item:
            continue

        action = item["action"]
        for key in description_keys:
            if key in action:
                seen[action[key]] = None

    return list(seen)
|
|
384
441
|
|
|
385
442
|
|
|
386
443
|
# Conversion functions between responses_items and completion messages formats
|
|
387
|
-
def convert_responses_items_to_completion_messages(
|
|
444
|
+
def convert_responses_items_to_completion_messages(
|
|
445
|
+
messages: List[Dict[str, Any]],
|
|
446
|
+
allow_images_in_tool_results: bool = True,
|
|
447
|
+
send_multiple_user_images_per_parallel_tool_results: bool = False,
|
|
448
|
+
use_xml_tools: bool = False,
|
|
449
|
+
) -> List[Dict[str, Any]]:
|
|
388
450
|
"""Convert responses_items message format to liteLLM completion format.
|
|
389
|
-
|
|
451
|
+
|
|
390
452
|
Args:
|
|
391
453
|
messages: List of responses_items format messages
|
|
392
454
|
allow_images_in_tool_results: If True, include images in tool role messages.
|
|
393
455
|
If False, send tool message + separate user message with image.
|
|
456
|
+
send_multiple_user_images_per_parallel_tool_results: If True, send multiple user images in parallel tool results.
|
|
457
|
+
use_xml_tools: If True, use XML-style <tool_call> tags instead of tool_calls array.
|
|
458
|
+
Also sends tool results as user messages instead of tool role.
|
|
394
459
|
"""
|
|
460
|
+
# Assert that allow_images_in_tool_results is False when use_xml_tools is True
|
|
461
|
+
if use_xml_tools:
|
|
462
|
+
assert (
|
|
463
|
+
not allow_images_in_tool_results
|
|
464
|
+
), "allow_images_in_tool_results must be False when use_xml_tools is True"
|
|
395
465
|
completion_messages = []
|
|
396
|
-
|
|
397
|
-
for message in messages:
|
|
466
|
+
|
|
467
|
+
for i, message in enumerate(messages):
|
|
398
468
|
msg_type = message.get("type")
|
|
399
469
|
role = message.get("role")
|
|
400
|
-
|
|
470
|
+
|
|
401
471
|
# Handle user messages (both with and without explicit type)
|
|
402
472
|
if role == "user" or msg_type == "user":
|
|
403
473
|
content = message.get("content", "")
|
|
@@ -406,34 +476,19 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
406
476
|
completion_content = []
|
|
407
477
|
for item in content:
|
|
408
478
|
if item.get("type") == "input_image":
|
|
409
|
-
completion_content.append(
|
|
410
|
-
"type": "image_url",
|
|
411
|
-
|
|
412
|
-
"url": item.get("image_url")
|
|
413
|
-
}
|
|
414
|
-
})
|
|
479
|
+
completion_content.append(
|
|
480
|
+
{"type": "image_url", "image_url": {"url": item.get("image_url")}}
|
|
481
|
+
)
|
|
415
482
|
elif item.get("type") == "input_text":
|
|
416
|
-
completion_content.append({
|
|
417
|
-
"type": "text",
|
|
418
|
-
"text": item.get("text")
|
|
419
|
-
})
|
|
483
|
+
completion_content.append({"type": "text", "text": item.get("text")})
|
|
420
484
|
elif item.get("type") == "text":
|
|
421
|
-
completion_content.append({
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
})
|
|
425
|
-
|
|
426
|
-
completion_messages.append({
|
|
427
|
-
"role": "user",
|
|
428
|
-
"content": completion_content
|
|
429
|
-
})
|
|
485
|
+
completion_content.append({"type": "text", "text": item.get("text")})
|
|
486
|
+
|
|
487
|
+
completion_messages.append({"role": "user", "content": completion_content})
|
|
430
488
|
elif isinstance(content, str):
|
|
431
489
|
# Handle string content
|
|
432
|
-
completion_messages.append({
|
|
433
|
-
|
|
434
|
-
"content": content
|
|
435
|
-
})
|
|
436
|
-
|
|
490
|
+
completion_messages.append({"role": "user", "content": content})
|
|
491
|
+
|
|
437
492
|
# Handle assistant messages
|
|
438
493
|
elif role == "assistant" or msg_type == "message":
|
|
439
494
|
content = message.get("content", [])
|
|
@@ -444,13 +499,12 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
444
499
|
text_parts.append(item.get("text", ""))
|
|
445
500
|
elif item.get("type") == "text":
|
|
446
501
|
text_parts.append(item.get("text", ""))
|
|
447
|
-
|
|
502
|
+
|
|
448
503
|
if text_parts:
|
|
449
|
-
completion_messages.append(
|
|
450
|
-
"role": "assistant",
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
504
|
+
completion_messages.append(
|
|
505
|
+
{"role": "assistant", "content": "\n".join(text_parts)}
|
|
506
|
+
)
|
|
507
|
+
|
|
454
508
|
# Handle reasoning items (convert to assistant message)
|
|
455
509
|
elif msg_type == "reasoning":
|
|
456
510
|
summary = message.get("summary", [])
|
|
@@ -458,107 +512,185 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
458
512
|
for item in summary:
|
|
459
513
|
if item.get("type") == "summary_text":
|
|
460
514
|
text_parts.append(item.get("text", ""))
|
|
461
|
-
|
|
515
|
+
|
|
462
516
|
if text_parts:
|
|
463
|
-
completion_messages.append({
|
|
464
|
-
|
|
465
|
-
"content": "\n".join(text_parts)
|
|
466
|
-
})
|
|
467
|
-
|
|
517
|
+
completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})
|
|
518
|
+
|
|
468
519
|
# Handle function calls
|
|
469
520
|
elif msg_type == "function_call":
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
completion_messages
|
|
473
|
-
"role": "assistant",
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
"
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
"
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
521
|
+
if use_xml_tools:
|
|
522
|
+
# Use XML format instead of tool_calls array
|
|
523
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
524
|
+
completion_messages.append({"role": "assistant", "content": ""})
|
|
525
|
+
|
|
526
|
+
# Ensure arguments is a JSON string (not a dict)
|
|
527
|
+
arguments = message.get("arguments")
|
|
528
|
+
if isinstance(arguments, dict):
|
|
529
|
+
arguments = json.dumps(arguments)
|
|
530
|
+
|
|
531
|
+
# Format as XML tool call
|
|
532
|
+
tool_call_xml = f'<tool_call>{{"name": "{message.get("name")}", "arguments": {arguments}}}</tool_call>'
|
|
533
|
+
if completion_messages[-1]["content"]:
|
|
534
|
+
completion_messages[-1]["content"] += "\n" + tool_call_xml
|
|
535
|
+
else:
|
|
536
|
+
completion_messages[-1]["content"] = tool_call_xml
|
|
537
|
+
else:
|
|
538
|
+
# Add tool call to last assistant message or create new one
|
|
539
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
540
|
+
completion_messages.append(
|
|
541
|
+
{"role": "assistant", "content": "", "tool_calls": []}
|
|
542
|
+
)
|
|
543
|
+
|
|
544
|
+
if "tool_calls" not in completion_messages[-1]:
|
|
545
|
+
completion_messages[-1]["tool_calls"] = []
|
|
546
|
+
|
|
547
|
+
# Ensure arguments is a JSON string (not a dict)
|
|
548
|
+
arguments = message.get("arguments")
|
|
549
|
+
if isinstance(arguments, dict):
|
|
550
|
+
arguments = json.dumps(arguments)
|
|
551
|
+
|
|
552
|
+
completion_messages[-1]["tool_calls"].append(
|
|
553
|
+
{
|
|
554
|
+
"id": message.get("call_id"),
|
|
555
|
+
"type": "function",
|
|
556
|
+
"function": {
|
|
557
|
+
"name": message.get("name"),
|
|
558
|
+
"arguments": arguments,
|
|
559
|
+
},
|
|
560
|
+
}
|
|
561
|
+
)
|
|
562
|
+
|
|
490
563
|
# Handle computer calls
|
|
491
564
|
elif msg_type == "computer_call":
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
completion_messages
|
|
495
|
-
"role": "assistant",
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
"
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
565
|
+
if use_xml_tools:
|
|
566
|
+
# Use XML format instead of tool_calls array
|
|
567
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
568
|
+
completion_messages.append({"role": "assistant", "content": ""})
|
|
569
|
+
|
|
570
|
+
action = message.get("action", {})
|
|
571
|
+
# Format as XML tool call
|
|
572
|
+
tool_call_xml = f'<tool_call>{{"name": "computer", "arguments": {json.dumps(action)}}}</tool_call>'
|
|
573
|
+
if completion_messages[-1]["content"]:
|
|
574
|
+
completion_messages[-1]["content"] += "\n" + tool_call_xml
|
|
575
|
+
else:
|
|
576
|
+
completion_messages[-1]["content"] = tool_call_xml
|
|
577
|
+
else:
|
|
578
|
+
# Add tool call to last assistant message or create new one
|
|
579
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
580
|
+
completion_messages.append(
|
|
581
|
+
{"role": "assistant", "content": "", "tool_calls": []}
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
if "tool_calls" not in completion_messages[-1]:
|
|
585
|
+
completion_messages[-1]["tool_calls"] = []
|
|
586
|
+
|
|
587
|
+
action = message.get("action", {})
|
|
588
|
+
completion_messages[-1]["tool_calls"].append(
|
|
589
|
+
{
|
|
590
|
+
"id": message.get("call_id"),
|
|
591
|
+
"type": "function",
|
|
592
|
+
"function": {"name": "computer", "arguments": json.dumps(action)},
|
|
593
|
+
}
|
|
594
|
+
)
|
|
595
|
+
|
|
513
596
|
# Handle function/computer call outputs
|
|
514
597
|
elif msg_type in ["function_call_output", "computer_call_output"]:
|
|
515
598
|
output = message.get("output")
|
|
516
599
|
call_id = message.get("call_id")
|
|
517
|
-
|
|
518
|
-
if
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
"
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
600
|
+
|
|
601
|
+
if use_xml_tools:
|
|
602
|
+
# When using XML tools, send all results as user messages
|
|
603
|
+
if isinstance(output, dict) and output.get("type") == "input_image":
|
|
604
|
+
# Send image as user message
|
|
605
|
+
completion_messages.append(
|
|
606
|
+
{
|
|
607
|
+
"role": "user",
|
|
608
|
+
"content": [
|
|
609
|
+
{
|
|
610
|
+
"type": "image_url",
|
|
611
|
+
"image_url": {"url": output.get("image_url")},
|
|
612
|
+
}
|
|
613
|
+
],
|
|
614
|
+
}
|
|
615
|
+
)
|
|
531
616
|
else:
|
|
532
|
-
# Send
|
|
533
|
-
completion_messages
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
"content": [{
|
|
540
|
-
"type": "image_url",
|
|
541
|
-
"image_url": {
|
|
542
|
-
"url": output.get("image_url")
|
|
543
|
-
}
|
|
544
|
-
}]
|
|
545
|
-
}]
|
|
617
|
+
# Send text result as user message
|
|
618
|
+
completion_messages.append(
|
|
619
|
+
{
|
|
620
|
+
"role": "user",
|
|
621
|
+
"content": str(output),
|
|
622
|
+
}
|
|
623
|
+
)
|
|
546
624
|
else:
|
|
547
|
-
#
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
625
|
+
# Standard tool message handling
|
|
626
|
+
if isinstance(output, dict) and output.get("type") == "input_image":
|
|
627
|
+
if allow_images_in_tool_results:
|
|
628
|
+
# Handle image output as tool response (may not work with all APIs)
|
|
629
|
+
completion_messages.append(
|
|
630
|
+
{
|
|
631
|
+
"role": "tool",
|
|
632
|
+
"tool_call_id": call_id,
|
|
633
|
+
"content": [
|
|
634
|
+
{
|
|
635
|
+
"type": "image_url",
|
|
636
|
+
"image_url": {"url": output.get("image_url")},
|
|
637
|
+
}
|
|
638
|
+
],
|
|
639
|
+
}
|
|
640
|
+
)
|
|
641
|
+
else:
|
|
642
|
+
# Determine if the next message is also a tool call output
|
|
643
|
+
next_type = None
|
|
644
|
+
if i + 1 < len(messages):
|
|
645
|
+
next_msg = messages[i + 1]
|
|
646
|
+
next_type = next_msg.get("type")
|
|
647
|
+
is_next_message_image_result = next_type in [
|
|
648
|
+
"computer_call_output",
|
|
649
|
+
]
|
|
650
|
+
# Send tool message + separate user message with image (OpenAI compatible)
|
|
651
|
+
completion_messages += (
|
|
652
|
+
[
|
|
653
|
+
{
|
|
654
|
+
"role": "tool",
|
|
655
|
+
"tool_call_id": call_id,
|
|
656
|
+
"content": "[Execution completed. See screenshot below]",
|
|
657
|
+
},
|
|
658
|
+
{
|
|
659
|
+
"role": "user",
|
|
660
|
+
"content": [
|
|
661
|
+
{
|
|
662
|
+
"type": "image_url",
|
|
663
|
+
"image_url": {"url": output.get("image_url")},
|
|
664
|
+
}
|
|
665
|
+
],
|
|
666
|
+
},
|
|
667
|
+
]
|
|
668
|
+
if send_multiple_user_images_per_parallel_tool_results
|
|
669
|
+
or (not is_next_message_image_result)
|
|
670
|
+
else [
|
|
671
|
+
{
|
|
672
|
+
"role": "tool",
|
|
673
|
+
"tool_call_id": call_id,
|
|
674
|
+
"content": "[Execution completed. See screenshot below]",
|
|
675
|
+
},
|
|
676
|
+
]
|
|
677
|
+
)
|
|
678
|
+
else:
|
|
679
|
+
# Handle text output as tool response
|
|
680
|
+
completion_messages.append(
|
|
681
|
+
{"role": "tool", "tool_call_id": call_id, "content": str(output)}
|
|
682
|
+
)
|
|
683
|
+
|
|
554
684
|
return completion_messages
|
|
555
685
|
|
|
556
686
|
|
|
557
|
-
def convert_completion_messages_to_responses_items(
|
|
687
|
+
def convert_completion_messages_to_responses_items(
|
|
688
|
+
completion_messages: List[Dict[str, Any]],
|
|
689
|
+
) -> List[Dict[str, Any]]:
|
|
558
690
|
"""Convert completion messages format to responses_items message format."""
|
|
559
691
|
responses_items = []
|
|
560
692
|
skip_next = False
|
|
561
|
-
|
|
693
|
+
|
|
562
694
|
for i, message in enumerate(completion_messages):
|
|
563
695
|
if skip_next:
|
|
564
696
|
skip_next = False
|
|
@@ -567,25 +699,24 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
567
699
|
role = message.get("role")
|
|
568
700
|
content = message.get("content")
|
|
569
701
|
tool_calls = message.get("tool_calls", [])
|
|
570
|
-
|
|
702
|
+
|
|
571
703
|
# Handle assistant messages with text content
|
|
572
704
|
if role == "assistant" and content and isinstance(content, str):
|
|
573
|
-
responses_items.append(
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
"type": "output_text",
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
705
|
+
responses_items.append(
|
|
706
|
+
{
|
|
707
|
+
"type": "message",
|
|
708
|
+
"role": "assistant",
|
|
709
|
+
"content": [{"type": "output_text", "text": content}],
|
|
710
|
+
}
|
|
711
|
+
)
|
|
712
|
+
|
|
582
713
|
# Handle tool calls
|
|
583
714
|
if tool_calls:
|
|
584
715
|
for tool_call in tool_calls:
|
|
585
716
|
if tool_call.get("type") == "function":
|
|
586
717
|
function = tool_call.get("function", {})
|
|
587
718
|
function_name = function.get("name")
|
|
588
|
-
|
|
719
|
+
|
|
589
720
|
if function_name == "computer":
|
|
590
721
|
# Parse computer action
|
|
591
722
|
try:
|
|
@@ -594,31 +725,37 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
594
725
|
if action.get("action"):
|
|
595
726
|
action["type"] = action["action"]
|
|
596
727
|
del action["action"]
|
|
597
|
-
responses_items.append(
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
728
|
+
responses_items.append(
|
|
729
|
+
{
|
|
730
|
+
"type": "computer_call",
|
|
731
|
+
"call_id": tool_call.get("id"),
|
|
732
|
+
"action": action,
|
|
733
|
+
"status": "completed",
|
|
734
|
+
}
|
|
735
|
+
)
|
|
603
736
|
except json.JSONDecodeError:
|
|
604
737
|
# Fallback to function call format
|
|
605
|
-
responses_items.append(
|
|
738
|
+
responses_items.append(
|
|
739
|
+
{
|
|
740
|
+
"type": "function_call",
|
|
741
|
+
"call_id": tool_call.get("id"),
|
|
742
|
+
"name": function_name,
|
|
743
|
+
"arguments": function.get("arguments", "{}"),
|
|
744
|
+
"status": "completed",
|
|
745
|
+
}
|
|
746
|
+
)
|
|
747
|
+
else:
|
|
748
|
+
# Regular function call
|
|
749
|
+
responses_items.append(
|
|
750
|
+
{
|
|
606
751
|
"type": "function_call",
|
|
607
752
|
"call_id": tool_call.get("id"),
|
|
608
753
|
"name": function_name,
|
|
609
754
|
"arguments": function.get("arguments", "{}"),
|
|
610
|
-
"status": "completed"
|
|
611
|
-
}
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
responses_items.append({
|
|
615
|
-
"type": "function_call",
|
|
616
|
-
"call_id": tool_call.get("id"),
|
|
617
|
-
"name": function_name,
|
|
618
|
-
"arguments": function.get("arguments", "{}"),
|
|
619
|
-
"status": "completed"
|
|
620
|
-
})
|
|
621
|
-
|
|
755
|
+
"status": "completed",
|
|
756
|
+
}
|
|
757
|
+
)
|
|
758
|
+
|
|
622
759
|
# Handle tool messages (function/computer call outputs)
|
|
623
760
|
elif role == "tool" and content:
|
|
624
761
|
tool_call_id = message.get("tool_call_id")
|
|
@@ -627,74 +764,90 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
627
764
|
if content == "[Execution completed. See screenshot below]":
|
|
628
765
|
# Look ahead for the next user message with image
|
|
629
766
|
next_idx = i + 1
|
|
630
|
-
if (
|
|
631
|
-
|
|
632
|
-
|
|
767
|
+
if (
|
|
768
|
+
next_idx < len(completion_messages)
|
|
769
|
+
and completion_messages[next_idx].get("role") == "user"
|
|
770
|
+
and isinstance(completion_messages[next_idx].get("content"), list)
|
|
771
|
+
):
|
|
633
772
|
# Found the pattern - extract image from next message
|
|
634
773
|
next_content = completion_messages[next_idx]["content"]
|
|
635
774
|
for item in next_content:
|
|
636
775
|
if item.get("type") == "image_url":
|
|
637
|
-
responses_items.append(
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
"
|
|
642
|
-
|
|
776
|
+
responses_items.append(
|
|
777
|
+
{
|
|
778
|
+
"type": "computer_call_output",
|
|
779
|
+
"call_id": tool_call_id,
|
|
780
|
+
"output": {
|
|
781
|
+
"type": "input_image",
|
|
782
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
783
|
+
},
|
|
643
784
|
}
|
|
644
|
-
|
|
785
|
+
)
|
|
645
786
|
# Skip the next user message since we processed it
|
|
646
787
|
skip_next = True
|
|
647
788
|
break
|
|
648
789
|
else:
|
|
649
790
|
# No matching user message, treat as regular text
|
|
650
|
-
responses_items.append(
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
791
|
+
responses_items.append(
|
|
792
|
+
{
|
|
793
|
+
"type": "computer_call_output",
|
|
794
|
+
"call_id": tool_call_id,
|
|
795
|
+
"output": content,
|
|
796
|
+
}
|
|
797
|
+
)
|
|
655
798
|
else:
|
|
656
799
|
# Determine if this is a computer call or function call output
|
|
657
800
|
try:
|
|
658
801
|
# Try to parse as structured output
|
|
659
802
|
parsed_content = json.loads(content)
|
|
660
803
|
if parsed_content.get("type") == "input_image":
|
|
661
|
-
responses_items.append(
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
804
|
+
responses_items.append(
|
|
805
|
+
{
|
|
806
|
+
"type": "computer_call_output",
|
|
807
|
+
"call_id": tool_call_id,
|
|
808
|
+
"output": parsed_content,
|
|
809
|
+
}
|
|
810
|
+
)
|
|
666
811
|
else:
|
|
667
|
-
responses_items.append(
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
812
|
+
responses_items.append(
|
|
813
|
+
{
|
|
814
|
+
"type": "computer_call_output",
|
|
815
|
+
"call_id": tool_call_id,
|
|
816
|
+
"output": content,
|
|
817
|
+
}
|
|
818
|
+
)
|
|
672
819
|
except json.JSONDecodeError:
|
|
673
820
|
# Plain text output - could be function or computer call
|
|
674
|
-
responses_items.append(
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
821
|
+
responses_items.append(
|
|
822
|
+
{
|
|
823
|
+
"type": "function_call_output",
|
|
824
|
+
"call_id": tool_call_id,
|
|
825
|
+
"output": content,
|
|
826
|
+
}
|
|
827
|
+
)
|
|
679
828
|
elif isinstance(content, list):
|
|
680
829
|
# Handle structured content (e.g., images)
|
|
681
830
|
for item in content:
|
|
682
831
|
if item.get("type") == "image_url":
|
|
683
|
-
responses_items.append(
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
"
|
|
688
|
-
|
|
832
|
+
responses_items.append(
|
|
833
|
+
{
|
|
834
|
+
"type": "computer_call_output",
|
|
835
|
+
"call_id": tool_call_id,
|
|
836
|
+
"output": {
|
|
837
|
+
"type": "input_image",
|
|
838
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
839
|
+
},
|
|
689
840
|
}
|
|
690
|
-
|
|
841
|
+
)
|
|
691
842
|
elif item.get("type") == "text":
|
|
692
|
-
responses_items.append(
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
843
|
+
responses_items.append(
|
|
844
|
+
{
|
|
845
|
+
"type": "function_call_output",
|
|
846
|
+
"call_id": tool_call_id,
|
|
847
|
+
"output": item.get("text"),
|
|
848
|
+
}
|
|
849
|
+
)
|
|
850
|
+
|
|
698
851
|
# Handle actual user messages
|
|
699
852
|
elif role == "user" and content:
|
|
700
853
|
if isinstance(content, list):
|
|
@@ -702,27 +855,21 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
702
855
|
user_content = []
|
|
703
856
|
for item in content:
|
|
704
857
|
if item.get("type") == "image_url":
|
|
705
|
-
user_content.append(
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
858
|
+
user_content.append(
|
|
859
|
+
{
|
|
860
|
+
"type": "input_image",
|
|
861
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
862
|
+
}
|
|
863
|
+
)
|
|
709
864
|
elif item.get("type") == "text":
|
|
710
|
-
user_content.append({
|
|
711
|
-
|
|
712
|
-
"text": item.get("text")
|
|
713
|
-
})
|
|
714
|
-
|
|
865
|
+
user_content.append({"type": "input_text", "text": item.get("text")})
|
|
866
|
+
|
|
715
867
|
if user_content:
|
|
716
|
-
responses_items.append(
|
|
717
|
-
"role": "user",
|
|
718
|
-
|
|
719
|
-
"content": user_content
|
|
720
|
-
})
|
|
868
|
+
responses_items.append(
|
|
869
|
+
{"role": "user", "type": "message", "content": user_content}
|
|
870
|
+
)
|
|
721
871
|
elif isinstance(content, str):
|
|
722
872
|
# Handle simple text user message
|
|
723
|
-
responses_items.append({
|
|
724
|
-
|
|
725
|
-
"content": content
|
|
726
|
-
})
|
|
727
|
-
|
|
873
|
+
responses_items.append({"role": "user", "content": content})
|
|
874
|
+
|
|
728
875
|
return responses_items
|