cua-agent: cua_agent-0.4.22-py3-none-any.whl → cua_agent-0.7.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/responses.py
CHANGED
|
@@ -6,10 +6,10 @@ Based on the OpenAI spec for Responses API items.
|
|
|
6
6
|
import base64
|
|
7
7
|
import json
|
|
8
8
|
import uuid
|
|
9
|
-
from typing import
|
|
9
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
10
10
|
|
|
11
|
+
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
|
|
11
12
|
from openai.types.responses.response_computer_tool_call_param import (
|
|
12
|
-
ResponseComputerToolCallParam,
|
|
13
13
|
ActionClick,
|
|
14
14
|
ActionDoubleClick,
|
|
15
15
|
ActionDrag,
|
|
@@ -18,224 +18,222 @@ from openai.types.responses.response_computer_tool_call_param import (
|
|
|
18
18
|
ActionMove,
|
|
19
19
|
ActionScreenshot,
|
|
20
20
|
ActionScroll,
|
|
21
|
+
)
|
|
22
|
+
from openai.types.responses.response_computer_tool_call_param import (
|
|
21
23
|
ActionType as ActionTypeAction,
|
|
24
|
+
)
|
|
25
|
+
from openai.types.responses.response_computer_tool_call_param import (
|
|
22
26
|
ActionWait,
|
|
23
|
-
PendingSafetyCheck
|
|
27
|
+
PendingSafetyCheck,
|
|
28
|
+
ResponseComputerToolCallParam,
|
|
29
|
+
)
|
|
30
|
+
from openai.types.responses.response_function_tool_call_param import (
|
|
31
|
+
ResponseFunctionToolCallParam,
|
|
24
32
|
)
|
|
25
|
-
|
|
26
|
-
from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
|
|
27
|
-
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
|
|
28
|
-
from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
|
|
29
|
-
from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
|
|
30
|
-
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
|
|
31
33
|
from openai.types.responses.response_input_image_param import ResponseInputImageParam
|
|
34
|
+
from openai.types.responses.response_output_message_param import (
|
|
35
|
+
ResponseOutputMessageParam,
|
|
36
|
+
)
|
|
37
|
+
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
|
|
38
|
+
from openai.types.responses.response_reasoning_item_param import (
|
|
39
|
+
ResponseReasoningItemParam,
|
|
40
|
+
Summary,
|
|
41
|
+
)
|
|
42
|
+
|
|
32
43
|
|
|
33
44
|
def random_id():
|
|
34
45
|
return str(uuid.uuid4())
|
|
35
46
|
|
|
47
|
+
|
|
36
48
|
# User message items
|
|
37
49
|
def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
|
|
38
50
|
return EasyInputMessageParam(
|
|
39
51
|
content=[
|
|
40
52
|
ResponseInputImageParam(
|
|
41
53
|
type="input_image",
|
|
42
|
-
image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
|
|
43
|
-
)
|
|
54
|
+
image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}",
|
|
55
|
+
) # type: ignore
|
|
44
56
|
],
|
|
45
57
|
role="user",
|
|
46
|
-
type="message"
|
|
58
|
+
type="message",
|
|
47
59
|
)
|
|
48
60
|
|
|
61
|
+
|
|
49
62
|
# Text items
|
|
50
63
|
def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
|
|
51
64
|
return ResponseReasoningItemParam(
|
|
52
|
-
id=random_id(),
|
|
53
|
-
summary=[
|
|
54
|
-
Summary(text=reasoning, type="summary_text")
|
|
55
|
-
],
|
|
56
|
-
type="reasoning"
|
|
65
|
+
id=random_id(), summary=[Summary(text=reasoning, type="summary_text")], type="reasoning"
|
|
57
66
|
)
|
|
58
67
|
|
|
68
|
+
|
|
59
69
|
def make_output_text_item(content: str) -> ResponseOutputMessageParam:
|
|
60
70
|
return ResponseOutputMessageParam(
|
|
61
71
|
id=random_id(),
|
|
62
|
-
content=[
|
|
63
|
-
ResponseOutputTextParam(
|
|
64
|
-
text=content,
|
|
65
|
-
type="output_text",
|
|
66
|
-
annotations=[]
|
|
67
|
-
)
|
|
68
|
-
],
|
|
72
|
+
content=[ResponseOutputTextParam(text=content, type="output_text", annotations=[])],
|
|
69
73
|
role="assistant",
|
|
70
74
|
status="completed",
|
|
71
|
-
type="message"
|
|
75
|
+
type="message",
|
|
72
76
|
)
|
|
73
77
|
|
|
78
|
+
|
|
74
79
|
# Function call items
|
|
75
|
-
def make_function_call_item(
|
|
80
|
+
def make_function_call_item(
|
|
81
|
+
function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
|
|
82
|
+
) -> ResponseFunctionToolCallParam:
|
|
76
83
|
return ResponseFunctionToolCallParam(
|
|
77
84
|
id=random_id(),
|
|
78
85
|
call_id=call_id if call_id else random_id(),
|
|
79
86
|
name=function_name,
|
|
80
87
|
arguments=json.dumps(arguments),
|
|
81
88
|
status="completed",
|
|
82
|
-
type="function_call"
|
|
89
|
+
type="function_call",
|
|
83
90
|
)
|
|
84
91
|
|
|
92
|
+
|
|
85
93
|
# Computer tool call items
|
|
86
|
-
def make_click_item(
|
|
94
|
+
def make_click_item(
|
|
95
|
+
x: int,
|
|
96
|
+
y: int,
|
|
97
|
+
button: Literal["left", "right", "wheel", "back", "forward"] = "left",
|
|
98
|
+
call_id: Optional[str] = None,
|
|
99
|
+
) -> ResponseComputerToolCallParam:
|
|
87
100
|
return ResponseComputerToolCallParam(
|
|
88
101
|
id=random_id(),
|
|
89
102
|
call_id=call_id if call_id else random_id(),
|
|
90
|
-
action=ActionClick(
|
|
91
|
-
button=button,
|
|
92
|
-
type="click",
|
|
93
|
-
x=x,
|
|
94
|
-
y=y
|
|
95
|
-
),
|
|
103
|
+
action=ActionClick(button=button, type="click", x=x, y=y),
|
|
96
104
|
pending_safety_checks=[],
|
|
97
105
|
status="completed",
|
|
98
|
-
type="computer_call"
|
|
106
|
+
type="computer_call",
|
|
99
107
|
)
|
|
100
108
|
|
|
101
|
-
|
|
109
|
+
|
|
110
|
+
def make_double_click_item(
|
|
111
|
+
x: int, y: int, call_id: Optional[str] = None
|
|
112
|
+
) -> ResponseComputerToolCallParam:
|
|
102
113
|
return ResponseComputerToolCallParam(
|
|
103
114
|
id=random_id(),
|
|
104
115
|
call_id=call_id if call_id else random_id(),
|
|
105
|
-
action=ActionDoubleClick(
|
|
106
|
-
type="double_click",
|
|
107
|
-
x=x,
|
|
108
|
-
y=y
|
|
109
|
-
),
|
|
116
|
+
action=ActionDoubleClick(type="double_click", x=x, y=y),
|
|
110
117
|
pending_safety_checks=[],
|
|
111
118
|
status="completed",
|
|
112
|
-
type="computer_call"
|
|
119
|
+
type="computer_call",
|
|
113
120
|
)
|
|
114
121
|
|
|
115
|
-
|
|
122
|
+
|
|
123
|
+
def make_drag_item(
|
|
124
|
+
path: List[Dict[str, int]], call_id: Optional[str] = None
|
|
125
|
+
) -> ResponseComputerToolCallParam:
|
|
116
126
|
drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
|
|
117
127
|
return ResponseComputerToolCallParam(
|
|
118
128
|
id=random_id(),
|
|
119
129
|
call_id=call_id if call_id else random_id(),
|
|
120
|
-
action=ActionDrag(
|
|
121
|
-
path=drag_path,
|
|
122
|
-
type="drag"
|
|
123
|
-
),
|
|
130
|
+
action=ActionDrag(path=drag_path, type="drag"),
|
|
124
131
|
pending_safety_checks=[],
|
|
125
132
|
status="completed",
|
|
126
|
-
type="computer_call"
|
|
133
|
+
type="computer_call",
|
|
127
134
|
)
|
|
128
135
|
|
|
129
|
-
|
|
136
|
+
|
|
137
|
+
def make_keypress_item(
|
|
138
|
+
keys: List[str], call_id: Optional[str] = None
|
|
139
|
+
) -> ResponseComputerToolCallParam:
|
|
130
140
|
return ResponseComputerToolCallParam(
|
|
131
141
|
id=random_id(),
|
|
132
142
|
call_id=call_id if call_id else random_id(),
|
|
133
|
-
action=ActionKeypress(
|
|
134
|
-
keys=keys,
|
|
135
|
-
type="keypress"
|
|
136
|
-
),
|
|
143
|
+
action=ActionKeypress(keys=keys, type="keypress"),
|
|
137
144
|
pending_safety_checks=[],
|
|
138
145
|
status="completed",
|
|
139
|
-
type="computer_call"
|
|
146
|
+
type="computer_call",
|
|
140
147
|
)
|
|
141
148
|
|
|
149
|
+
|
|
142
150
|
def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
143
151
|
return ResponseComputerToolCallParam(
|
|
144
152
|
id=random_id(),
|
|
145
153
|
call_id=call_id if call_id else random_id(),
|
|
146
|
-
action=ActionMove(
|
|
147
|
-
type="move",
|
|
148
|
-
x=x,
|
|
149
|
-
y=y
|
|
150
|
-
),
|
|
154
|
+
action=ActionMove(type="move", x=x, y=y),
|
|
151
155
|
pending_safety_checks=[],
|
|
152
156
|
status="completed",
|
|
153
|
-
type="computer_call"
|
|
157
|
+
type="computer_call",
|
|
154
158
|
)
|
|
155
159
|
|
|
160
|
+
|
|
156
161
|
def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
157
162
|
return ResponseComputerToolCallParam(
|
|
158
163
|
id=random_id(),
|
|
159
164
|
call_id=call_id if call_id else random_id(),
|
|
160
|
-
action=ActionScreenshot(
|
|
161
|
-
type="screenshot"
|
|
162
|
-
),
|
|
165
|
+
action=ActionScreenshot(type="screenshot"),
|
|
163
166
|
pending_safety_checks=[],
|
|
164
167
|
status="completed",
|
|
165
|
-
type="computer_call"
|
|
168
|
+
type="computer_call",
|
|
166
169
|
)
|
|
167
170
|
|
|
168
|
-
|
|
171
|
+
|
|
172
|
+
def make_scroll_item(
|
|
173
|
+
x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
|
|
174
|
+
) -> ResponseComputerToolCallParam:
|
|
169
175
|
return ResponseComputerToolCallParam(
|
|
170
176
|
id=random_id(),
|
|
171
177
|
call_id=call_id if call_id else random_id(),
|
|
172
|
-
action=ActionScroll(
|
|
173
|
-
scroll_x=scroll_x,
|
|
174
|
-
scroll_y=scroll_y,
|
|
175
|
-
type="scroll",
|
|
176
|
-
x=x,
|
|
177
|
-
y=y
|
|
178
|
-
),
|
|
178
|
+
action=ActionScroll(scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y),
|
|
179
179
|
pending_safety_checks=[],
|
|
180
180
|
status="completed",
|
|
181
|
-
type="computer_call"
|
|
181
|
+
type="computer_call",
|
|
182
182
|
)
|
|
183
183
|
|
|
184
|
+
|
|
184
185
|
def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
185
186
|
return ResponseComputerToolCallParam(
|
|
186
187
|
id=random_id(),
|
|
187
188
|
call_id=call_id if call_id else random_id(),
|
|
188
|
-
action=ActionTypeAction(
|
|
189
|
-
text=text,
|
|
190
|
-
type="type"
|
|
191
|
-
),
|
|
189
|
+
action=ActionTypeAction(text=text, type="type"),
|
|
192
190
|
pending_safety_checks=[],
|
|
193
191
|
status="completed",
|
|
194
|
-
type="computer_call"
|
|
192
|
+
type="computer_call",
|
|
195
193
|
)
|
|
196
194
|
|
|
195
|
+
|
|
197
196
|
def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
198
197
|
return ResponseComputerToolCallParam(
|
|
199
198
|
id=random_id(),
|
|
200
199
|
call_id=call_id if call_id else random_id(),
|
|
201
|
-
action=ActionWait(
|
|
202
|
-
type="wait"
|
|
203
|
-
),
|
|
200
|
+
action=ActionWait(type="wait"),
|
|
204
201
|
pending_safety_checks=[],
|
|
205
202
|
status="completed",
|
|
206
|
-
type="computer_call"
|
|
203
|
+
type="computer_call",
|
|
207
204
|
)
|
|
208
205
|
|
|
206
|
+
|
|
209
207
|
# Extra anthropic computer calls
|
|
210
|
-
def make_left_mouse_down_item(
|
|
208
|
+
def make_left_mouse_down_item(
|
|
209
|
+
x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
|
|
210
|
+
) -> Dict[str, Any]:
|
|
211
211
|
return {
|
|
212
212
|
"id": random_id(),
|
|
213
213
|
"call_id": call_id if call_id else random_id(),
|
|
214
|
-
"action": {
|
|
215
|
-
"type": "left_mouse_down",
|
|
216
|
-
"x": x,
|
|
217
|
-
"y": y
|
|
218
|
-
},
|
|
214
|
+
"action": {"type": "left_mouse_down", "x": x, "y": y},
|
|
219
215
|
"pending_safety_checks": [],
|
|
220
216
|
"status": "completed",
|
|
221
|
-
"type": "computer_call"
|
|
217
|
+
"type": "computer_call",
|
|
222
218
|
}
|
|
223
219
|
|
|
224
|
-
|
|
220
|
+
|
|
221
|
+
def make_left_mouse_up_item(
|
|
222
|
+
x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
|
|
223
|
+
) -> Dict[str, Any]:
|
|
225
224
|
return {
|
|
226
225
|
"id": random_id(),
|
|
227
226
|
"call_id": call_id if call_id else random_id(),
|
|
228
|
-
"action": {
|
|
229
|
-
"type": "left_mouse_up",
|
|
230
|
-
"x": x,
|
|
231
|
-
"y": y
|
|
232
|
-
},
|
|
227
|
+
"action": {"type": "left_mouse_up", "x": x, "y": y},
|
|
233
228
|
"pending_safety_checks": [],
|
|
234
229
|
"status": "completed",
|
|
235
|
-
"type": "computer_call"
|
|
230
|
+
"type": "computer_call",
|
|
236
231
|
}
|
|
237
232
|
|
|
238
|
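-
def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]: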
|
|
233
|
+
|
|
234
|
+
def make_failed_tool_call_items(
|
|
235
|
+
tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
|
|
236
|
+
) -> List[Dict[str, Any]]:
|
|
239
237
|
call_id = call_id if call_id else random_id()
|
|
240
238
|
return [
|
|
241
239
|
{
|
|
@@ -249,9 +247,10 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
|
|
|
249
247
|
"type": "function_call_output",
|
|
250
248
|
"call_id": call_id,
|
|
251
249
|
"output": json.dumps({"error": error_message}),
|
|
252
|
-
}
|
|
250
|
+
},
|
|
253
251
|
]
|
|
254
252
|
|
|
253
|
+
|
|
255
254
|
def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
|
|
256
255
|
call_id = call_id if call_id else random_id()
|
|
257
256
|
return {
|
|
@@ -260,12 +259,15 @@ def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> D
|
|
|
260
259
|
"output": json.dumps({"error": error_message}),
|
|
261
260
|
}
|
|
262
261
|
|
|
263
|
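-
def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: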
|
|
262
|
+
|
|
263
|
+
def replace_failed_computer_calls_with_function_calls(
|
|
264
|
+
messages: List[Dict[str, Any]],
|
|
265
|
+
) -> List[Dict[str, Any]]:
|
|
264
266
|
"""
|
|
265
267
|
Replace computer_call items with function_call items if they share a call_id with a function_call_output.
|
|
266
268
|
This indicates the computer call failed and should be treated as a function call instead.
|
|
267
269
|
We do this because the computer_call_output items do not support text output.
|
|
268
|
-
|
|
270
|
+
|
|
269
271
|
Args:
|
|
270
272
|
messages: List of message items to process
|
|
271
273
|
"""
|
|
@@ -278,16 +280,15 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
|
|
|
278
280
|
call_id = msg.get("call_id")
|
|
279
281
|
if call_id:
|
|
280
282
|
failed_call_ids.add(call_id)
|
|
281
|
-
|
|
283
|
+
|
|
282
284
|
# Replace computer_call items that have matching call_ids
|
|
283
285
|
for i, msg in enumerate(messages):
|
|
284
|
-
if
|
|
285
|
-
|
|
286
|
-
|
|
286
|
+
if msg.get("type") == "computer_call" and msg.get("call_id") in failed_call_ids:
|
|
287
|
+
|
|
287
288
|
# Extract action from computer_call
|
|
288
289
|
action = msg.get("action", {})
|
|
289
290
|
call_id = msg.get("call_id")
|
|
290
|
-
|
|
291
|
+
|
|
291
292
|
# Create function_call replacement
|
|
292
293
|
messages[i] = {
|
|
293
294
|
"type": "function_call",
|
|
@@ -296,27 +297,30 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
|
|
|
296
297
|
"name": "computer",
|
|
297
298
|
"arguments": json.dumps(action),
|
|
298
299
|
}
|
|
299
|
-
|
|
300
|
+
|
|
300
301
|
return messages
|
|
301
302
|
|
|
303
|
+
|
|
302
304
|
# Conversion functions between element descriptions and coordinates
|
|
303
|
-
def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
|
|
305
|
+
def convert_computer_calls_desc2xy(
|
|
306
|
+
responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
|
|
307
|
+
) -> List[Dict[str, Any]]:
|
|
304
308
|
"""
|
|
305
309
|
Convert computer calls from element descriptions to x,y coordinates.
|
|
306
|
-
|
|
310
|
+
|
|
307
311
|
Args:
|
|
308
312
|
responses_items: List of response items containing computer calls with element_description
|
|
309
313
|
desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
|
|
310
|
-
|
|
314
|
+
|
|
311
315
|
Returns:
|
|
312
316
|
List of response items with element_description replaced by x,y coordinates
|
|
313
317
|
"""
|
|
314
318
|
converted_items = []
|
|
315
|
-
|
|
319
|
+
|
|
316
320
|
for item in responses_items:
|
|
317
321
|
if item.get("type") == "computer_call" and "action" in item:
|
|
318
322
|
action = item["action"].copy()
|
|
319
|
-
|
|
323
|
+
|
|
320
324
|
# Handle single element_description
|
|
321
325
|
if "element_description" in action:
|
|
322
326
|
desc = action["element_description"]
|
|
@@ -325,48 +329,50 @@ def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2x
|
|
|
325
329
|
action["x"] = x
|
|
326
330
|
action["y"] = y
|
|
327
331
|
del action["element_description"]
|
|
328
|
-
|
|
332
|
+
|
|
329
333
|
# Handle start_element_description and end_element_description for drag operations
|
|
330
334
|
elif "start_element_description" in action and "end_element_description" in action:
|
|
331
335
|
start_desc = action["start_element_description"]
|
|
332
336
|
end_desc = action["end_element_description"]
|
|
333
|
-
|
|
337
|
+
|
|
334
338
|
if start_desc in desc2xy and end_desc in desc2xy:
|
|
335
339
|
start_x, start_y = desc2xy[start_desc]
|
|
336
340
|
end_x, end_y = desc2xy[end_desc]
|
|
337
341
|
action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
|
|
338
342
|
del action["start_element_description"]
|
|
339
343
|
del action["end_element_description"]
|
|
340
|
-
|
|
344
|
+
|
|
341
345
|
converted_item = item.copy()
|
|
342
346
|
converted_item["action"] = action
|
|
343
347
|
converted_items.append(converted_item)
|
|
344
348
|
else:
|
|
345
349
|
converted_items.append(item)
|
|
346
|
-
|
|
350
|
+
|
|
347
351
|
return converted_items
|
|
348
352
|
|
|
349
353
|
|
|
350
|
-
def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
|
|
354
|
+
def convert_computer_calls_xy2desc(
|
|
355
|
+
responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
|
|
356
|
+
) -> List[Dict[str, Any]]:
|
|
351
357
|
"""
|
|
352
358
|
Convert computer calls from x,y coordinates to element descriptions.
|
|
353
|
-
|
|
359
|
+
|
|
354
360
|
Args:
|
|
355
361
|
responses_items: List of response items containing computer calls with x,y coordinates
|
|
356
362
|
desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
|
|
357
|
-
|
|
363
|
+
|
|
358
364
|
Returns:
|
|
359
365
|
List of response items with x,y coordinates replaced by element_description
|
|
360
366
|
"""
|
|
361
367
|
# Create reverse mapping from coordinates to descriptions
|
|
362
368
|
xy2desc = {coords: desc for desc, coords in desc2xy.items()}
|
|
363
|
-
|
|
369
|
+
|
|
364
370
|
converted_items = []
|
|
365
|
-
|
|
371
|
+
|
|
366
372
|
for item in responses_items:
|
|
367
373
|
if item.get("type") == "computer_call" and "action" in item:
|
|
368
374
|
action = item["action"].copy()
|
|
369
|
-
|
|
375
|
+
|
|
370
376
|
# Handle single x,y coordinates
|
|
371
377
|
if "x" in action and "y" in action:
|
|
372
378
|
coords = (action["x"], action["y"])
|
|
@@ -374,77 +380,94 @@ def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2x
|
|
|
374
380
|
action["element_description"] = xy2desc[coords]
|
|
375
381
|
del action["x"]
|
|
376
382
|
del action["y"]
|
|
377
|
-
|
|
383
|
+
|
|
378
384
|
# Handle path for drag operations
|
|
379
385
|
elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
|
|
380
386
|
start_point = action["path"][0]
|
|
381
387
|
end_point = action["path"][1]
|
|
382
|
-
|
|
383
|
-
if (
|
|
384
|
-
"x" in
|
|
385
|
-
|
|
388
|
+
|
|
389
|
+
if (
|
|
390
|
+
"x" in start_point
|
|
391
|
+
and "y" in start_point
|
|
392
|
+
and "x" in end_point
|
|
393
|
+
and "y" in end_point
|
|
394
|
+
):
|
|
395
|
+
|
|
386
396
|
start_coords = (start_point["x"], start_point["y"])
|
|
387
397
|
end_coords = (end_point["x"], end_point["y"])
|
|
388
|
-
|
|
398
|
+
|
|
389
399
|
if start_coords in xy2desc and end_coords in xy2desc:
|
|
390
400
|
action["start_element_description"] = xy2desc[start_coords]
|
|
391
401
|
action["end_element_description"] = xy2desc[end_coords]
|
|
392
402
|
del action["path"]
|
|
393
|
-
|
|
403
|
+
|
|
394
404
|
converted_item = item.copy()
|
|
395
405
|
converted_item["action"] = action
|
|
396
406
|
converted_items.append(converted_item)
|
|
397
407
|
else:
|
|
398
408
|
converted_items.append(item)
|
|
399
|
-
|
|
409
|
+
|
|
400
410
|
return converted_items
|
|
401
411
|
|
|
402
412
|
|
|
403
413
|
def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
|
|
404
414
|
"""
|
|
405
415
|
Extract all element descriptions from computer calls in responses items.
|
|
406
|
-
|
|
416
|
+
|
|
407
417
|
Args:
|
|
408
418
|
responses_items: List of response items containing computer calls
|
|
409
|
-
|
|
419
|
+
|
|
410
420
|
Returns:
|
|
411
421
|
List of unique element descriptions found in computer calls
|
|
412
422
|
"""
|
|
413
423
|
descriptions = set()
|
|
414
|
-
|
|
424
|
+
|
|
415
425
|
for item in responses_items:
|
|
416
426
|
if item.get("type") == "computer_call" and "action" in item:
|
|
417
427
|
action = item["action"]
|
|
418
|
-
|
|
428
|
+
|
|
419
429
|
# Handle single element_description
|
|
420
430
|
if "element_description" in action:
|
|
421
431
|
descriptions.add(action["element_description"])
|
|
422
|
-
|
|
432
|
+
|
|
423
433
|
# Handle start_element_description and end_element_description for drag operations
|
|
424
434
|
if "start_element_description" in action:
|
|
425
435
|
descriptions.add(action["start_element_description"])
|
|
426
|
-
|
|
436
|
+
|
|
427
437
|
if "end_element_description" in action:
|
|
428
438
|
descriptions.add(action["end_element_description"])
|
|
429
|
-
|
|
439
|
+
|
|
430
440
|
return list(descriptions)
|
|
431
441
|
|
|
432
442
|
|
|
433
443
|
# Conversion functions between responses_items and completion messages formats
|
|
434
|
-
def convert_responses_items_to_completion_messages(
|
|
444
|
+
def convert_responses_items_to_completion_messages(
|
|
445
|
+
messages: List[Dict[str, Any]],
|
|
446
|
+
allow_images_in_tool_results: bool = True,
|
|
447
|
+
send_multiple_user_images_per_parallel_tool_results: bool = False,
|
|
448
|
+
use_xml_tools: bool = False,
|
|
449
|
+
) -> List[Dict[str, Any]]:
|
|
435
450
|
"""Convert responses_items message format to liteLLM completion format.
|
|
436
|
-
|
|
451
|
+
|
|
437
452
|
Args:
|
|
438
453
|
messages: List of responses_items format messages
|
|
439
454
|
allow_images_in_tool_results: If True, include images in tool role messages.
|
|
440
455
|
If False, send tool message + separate user message with image.
|
|
456
|
+
send_multiple_user_images_per_parallel_tool_results: If True, send multiple user images in parallel tool results.
|
|
457
|
+
use_xml_tools: If True, use XML-style <tool_call> tags instead of tool_calls array.
|
|
458
|
+
Also sends tool results as user messages instead of tool role.
|
|
441
459
|
"""
|
|
460
|
+
# Assert that allow_images_in_tool_results is False when use_xml_tools is True
|
|
461
|
+
if use_xml_tools:
|
|
462
|
+
assert (
|
|
463
|
+
not allow_images_in_tool_results
|
|
464
|
+
), "allow_images_in_tool_results must be False when use_xml_tools is True"
|
|
442
465
|
completion_messages = []
|
|
443
|
-
|
|
444
|
-
for message in messages:
|
|
466
|
+
|
|
467
|
+
for i, message in enumerate(messages):
|
|
445
468
|
msg_type = message.get("type")
|
|
446
469
|
role = message.get("role")
|
|
447
|
-
|
|
470
|
+
|
|
448
471
|
# Handle user messages (both with and without explicit type)
|
|
449
472
|
if role == "user" or msg_type == "user":
|
|
450
473
|
content = message.get("content", "")
|
|
@@ -453,34 +476,19 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
453
476
|
completion_content = []
|
|
454
477
|
for item in content:
|
|
455
478
|
if item.get("type") == "input_image":
|
|
456
|
-
completion_content.append(
|
|
457
|
-
"type": "image_url",
|
|
458
|
-
|
|
459
|
-
"url": item.get("image_url")
|
|
460
|
-
}
|
|
461
|
-
})
|
|
479
|
+
completion_content.append(
|
|
480
|
+
{"type": "image_url", "image_url": {"url": item.get("image_url")}}
|
|
481
|
+
)
|
|
462
482
|
elif item.get("type") == "input_text":
|
|
463
|
-
completion_content.append({
|
|
464
|
-
"type": "text",
|
|
465
|
-
"text": item.get("text")
|
|
466
|
-
})
|
|
483
|
+
completion_content.append({"type": "text", "text": item.get("text")})
|
|
467
484
|
elif item.get("type") == "text":
|
|
468
|
-
completion_content.append({
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
})
|
|
472
|
-
|
|
473
|
-
completion_messages.append({
|
|
474
|
-
"role": "user",
|
|
475
|
-
"content": completion_content
|
|
476
|
-
})
|
|
485
|
+
completion_content.append({"type": "text", "text": item.get("text")})
|
|
486
|
+
|
|
487
|
+
completion_messages.append({"role": "user", "content": completion_content})
|
|
477
488
|
elif isinstance(content, str):
|
|
478
489
|
# Handle string content
|
|
479
|
-
completion_messages.append({
|
|
480
|
-
|
|
481
|
-
"content": content
|
|
482
|
-
})
|
|
483
|
-
|
|
490
|
+
completion_messages.append({"role": "user", "content": content})
|
|
491
|
+
|
|
484
492
|
# Handle assistant messages
|
|
485
493
|
elif role == "assistant" or msg_type == "message":
|
|
486
494
|
content = message.get("content", [])
|
|
@@ -491,13 +499,12 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
491
499
|
text_parts.append(item.get("text", ""))
|
|
492
500
|
elif item.get("type") == "text":
|
|
493
501
|
text_parts.append(item.get("text", ""))
|
|
494
|
-
|
|
502
|
+
|
|
495
503
|
if text_parts:
|
|
496
|
-
completion_messages.append(
|
|
497
|
-
"role": "assistant",
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
504
|
+
completion_messages.append(
|
|
505
|
+
{"role": "assistant", "content": "\n".join(text_parts)}
|
|
506
|
+
)
|
|
507
|
+
|
|
501
508
|
# Handle reasoning items (convert to assistant message)
|
|
502
509
|
elif msg_type == "reasoning":
|
|
503
510
|
summary = message.get("summary", [])
|
|
@@ -505,107 +512,185 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
505
512
|
for item in summary:
|
|
506
513
|
if item.get("type") == "summary_text":
|
|
507
514
|
text_parts.append(item.get("text", ""))
|
|
508
|
-
|
|
515
|
+
|
|
509
516
|
if text_parts:
|
|
510
|
-
completion_messages.append({
|
|
511
|
-
|
|
512
|
-
"content": "\n".join(text_parts)
|
|
513
|
-
})
|
|
514
|
-
|
|
517
|
+
completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})
|
|
518
|
+
|
|
515
519
|
# Handle function calls
|
|
516
520
|
elif msg_type == "function_call":
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
completion_messages
|
|
520
|
-
"role": "assistant",
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
"
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
"
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
521
|
+
if use_xml_tools:
|
|
522
|
+
# Use XML format instead of tool_calls array
|
|
523
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
524
|
+
completion_messages.append({"role": "assistant", "content": ""})
|
|
525
|
+
|
|
526
|
+
# Ensure arguments is a JSON string (not a dict)
|
|
527
|
+
arguments = message.get("arguments")
|
|
528
|
+
if isinstance(arguments, dict):
|
|
529
|
+
arguments = json.dumps(arguments)
|
|
530
|
+
|
|
531
|
+
# Format as XML tool call
|
|
532
|
+
tool_call_xml = f'<tool_call>{{"name": "{message.get("name")}", "arguments": {arguments}}}</tool_call>'
|
|
533
|
+
if completion_messages[-1]["content"]:
|
|
534
|
+
completion_messages[-1]["content"] += "\n" + tool_call_xml
|
|
535
|
+
else:
|
|
536
|
+
completion_messages[-1]["content"] = tool_call_xml
|
|
537
|
+
else:
|
|
538
|
+
# Add tool call to last assistant message or create new one
|
|
539
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
540
|
+
completion_messages.append(
|
|
541
|
+
{"role": "assistant", "content": "", "tool_calls": []}
|
|
542
|
+
)
|
|
543
|
+
|
|
544
|
+
if "tool_calls" not in completion_messages[-1]:
|
|
545
|
+
completion_messages[-1]["tool_calls"] = []
|
|
546
|
+
|
|
547
|
+
# Ensure arguments is a JSON string (not a dict)
|
|
548
|
+
arguments = message.get("arguments")
|
|
549
|
+
if isinstance(arguments, dict):
|
|
550
|
+
arguments = json.dumps(arguments)
|
|
551
|
+
|
|
552
|
+
completion_messages[-1]["tool_calls"].append(
|
|
553
|
+
{
|
|
554
|
+
"id": message.get("call_id"),
|
|
555
|
+
"type": "function",
|
|
556
|
+
"function": {
|
|
557
|
+
"name": message.get("name"),
|
|
558
|
+
"arguments": arguments,
|
|
559
|
+
},
|
|
560
|
+
}
|
|
561
|
+
)
|
|
562
|
+
|
|
537
563
|
# Handle computer calls
|
|
538
564
|
elif msg_type == "computer_call":
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
completion_messages
|
|
542
|
-
"role": "assistant",
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
"
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
565
|
+
if use_xml_tools:
|
|
566
|
+
# Use XML format instead of tool_calls array
|
|
567
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
568
|
+
completion_messages.append({"role": "assistant", "content": ""})
|
|
569
|
+
|
|
570
|
+
action = message.get("action", {})
|
|
571
|
+
# Format as XML tool call
|
|
572
|
+
tool_call_xml = f'<tool_call>{{"name": "computer", "arguments": {json.dumps(action)}}}</tool_call>'
|
|
573
|
+
if completion_messages[-1]["content"]:
|
|
574
|
+
completion_messages[-1]["content"] += "\n" + tool_call_xml
|
|
575
|
+
else:
|
|
576
|
+
completion_messages[-1]["content"] = tool_call_xml
|
|
577
|
+
else:
|
|
578
|
+
# Add tool call to last assistant message or create new one
|
|
579
|
+
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
580
|
+
completion_messages.append(
|
|
581
|
+
{"role": "assistant", "content": "", "tool_calls": []}
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
if "tool_calls" not in completion_messages[-1]:
|
|
585
|
+
completion_messages[-1]["tool_calls"] = []
|
|
586
|
+
|
|
587
|
+
action = message.get("action", {})
|
|
588
|
+
completion_messages[-1]["tool_calls"].append(
|
|
589
|
+
{
|
|
590
|
+
"id": message.get("call_id"),
|
|
591
|
+
"type": "function",
|
|
592
|
+
"function": {"name": "computer", "arguments": json.dumps(action)},
|
|
593
|
+
}
|
|
594
|
+
)
|
|
595
|
+
|
|
560
596
|
# Handle function/computer call outputs
|
|
561
597
|
elif msg_type in ["function_call_output", "computer_call_output"]:
|
|
562
598
|
output = message.get("output")
|
|
563
599
|
call_id = message.get("call_id")
|
|
564
|
-
|
|
565
|
-
if
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
"
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
600
|
+
|
|
601
|
+
if use_xml_tools:
|
|
602
|
+
# When using XML tools, send all results as user messages
|
|
603
|
+
if isinstance(output, dict) and output.get("type") == "input_image":
|
|
604
|
+
# Send image as user message
|
|
605
|
+
completion_messages.append(
|
|
606
|
+
{
|
|
607
|
+
"role": "user",
|
|
608
|
+
"content": [
|
|
609
|
+
{
|
|
610
|
+
"type": "image_url",
|
|
611
|
+
"image_url": {"url": output.get("image_url")},
|
|
612
|
+
}
|
|
613
|
+
],
|
|
614
|
+
}
|
|
615
|
+
)
|
|
578
616
|
else:
|
|
579
|
-
# Send
|
|
580
|
-
completion_messages
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
"content": [{
|
|
587
|
-
"type": "image_url",
|
|
588
|
-
"image_url": {
|
|
589
|
-
"url": output.get("image_url")
|
|
590
|
-
}
|
|
591
|
-
}]
|
|
592
|
-
}]
|
|
617
|
+
# Send text result as user message
|
|
618
|
+
completion_messages.append(
|
|
619
|
+
{
|
|
620
|
+
"role": "user",
|
|
621
|
+
"content": str(output),
|
|
622
|
+
}
|
|
623
|
+
)
|
|
593
624
|
else:
|
|
594
|
-
#
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
625
|
+
# Standard tool message handling
|
|
626
|
+
if isinstance(output, dict) and output.get("type") == "input_image":
|
|
627
|
+
if allow_images_in_tool_results:
|
|
628
|
+
# Handle image output as tool response (may not work with all APIs)
|
|
629
|
+
completion_messages.append(
|
|
630
|
+
{
|
|
631
|
+
"role": "tool",
|
|
632
|
+
"tool_call_id": call_id,
|
|
633
|
+
"content": [
|
|
634
|
+
{
|
|
635
|
+
"type": "image_url",
|
|
636
|
+
"image_url": {"url": output.get("image_url")},
|
|
637
|
+
}
|
|
638
|
+
],
|
|
639
|
+
}
|
|
640
|
+
)
|
|
641
|
+
else:
|
|
642
|
+
# Determine if the next message is also a tool call output
|
|
643
|
+
next_type = None
|
|
644
|
+
if i + 1 < len(messages):
|
|
645
|
+
next_msg = messages[i + 1]
|
|
646
|
+
next_type = next_msg.get("type")
|
|
647
|
+
is_next_message_image_result = next_type in [
|
|
648
|
+
"computer_call_output",
|
|
649
|
+
]
|
|
650
|
+
# Send tool message + separate user message with image (OpenAI compatible)
|
|
651
|
+
completion_messages += (
|
|
652
|
+
[
|
|
653
|
+
{
|
|
654
|
+
"role": "tool",
|
|
655
|
+
"tool_call_id": call_id,
|
|
656
|
+
"content": "[Execution completed. See screenshot below]",
|
|
657
|
+
},
|
|
658
|
+
{
|
|
659
|
+
"role": "user",
|
|
660
|
+
"content": [
|
|
661
|
+
{
|
|
662
|
+
"type": "image_url",
|
|
663
|
+
"image_url": {"url": output.get("image_url")},
|
|
664
|
+
}
|
|
665
|
+
],
|
|
666
|
+
},
|
|
667
|
+
]
|
|
668
|
+
if send_multiple_user_images_per_parallel_tool_results
|
|
669
|
+
or (not is_next_message_image_result)
|
|
670
|
+
else [
|
|
671
|
+
{
|
|
672
|
+
"role": "tool",
|
|
673
|
+
"tool_call_id": call_id,
|
|
674
|
+
"content": "[Execution completed. See screenshot below]",
|
|
675
|
+
},
|
|
676
|
+
]
|
|
677
|
+
)
|
|
678
|
+
else:
|
|
679
|
+
# Handle text output as tool response
|
|
680
|
+
completion_messages.append(
|
|
681
|
+
{"role": "tool", "tool_call_id": call_id, "content": str(output)}
|
|
682
|
+
)
|
|
683
|
+
|
|
601
684
|
return completion_messages
|
|
602
685
|
|
|
603
686
|
|
|
604
|
-
def convert_completion_messages_to_responses_items(
|
|
687
|
+
def convert_completion_messages_to_responses_items(
|
|
688
|
+
completion_messages: List[Dict[str, Any]],
|
|
689
|
+
) -> List[Dict[str, Any]]:
|
|
605
690
|
"""Convert completion messages format to responses_items message format."""
|
|
606
691
|
responses_items = []
|
|
607
692
|
skip_next = False
|
|
608
|
-
|
|
693
|
+
|
|
609
694
|
for i, message in enumerate(completion_messages):
|
|
610
695
|
if skip_next:
|
|
611
696
|
skip_next = False
|
|
@@ -614,25 +699,24 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
614
699
|
role = message.get("role")
|
|
615
700
|
content = message.get("content")
|
|
616
701
|
tool_calls = message.get("tool_calls", [])
|
|
617
|
-
|
|
702
|
+
|
|
618
703
|
# Handle assistant messages with text content
|
|
619
704
|
if role == "assistant" and content and isinstance(content, str):
|
|
620
|
-
responses_items.append(
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
"type": "output_text",
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
705
|
+
responses_items.append(
|
|
706
|
+
{
|
|
707
|
+
"type": "message",
|
|
708
|
+
"role": "assistant",
|
|
709
|
+
"content": [{"type": "output_text", "text": content}],
|
|
710
|
+
}
|
|
711
|
+
)
|
|
712
|
+
|
|
629
713
|
# Handle tool calls
|
|
630
714
|
if tool_calls:
|
|
631
715
|
for tool_call in tool_calls:
|
|
632
716
|
if tool_call.get("type") == "function":
|
|
633
717
|
function = tool_call.get("function", {})
|
|
634
718
|
function_name = function.get("name")
|
|
635
|
-
|
|
719
|
+
|
|
636
720
|
if function_name == "computer":
|
|
637
721
|
# Parse computer action
|
|
638
722
|
try:
|
|
@@ -641,31 +725,37 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
641
725
|
if action.get("action"):
|
|
642
726
|
action["type"] = action["action"]
|
|
643
727
|
del action["action"]
|
|
644
|
-
responses_items.append(
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
728
|
+
responses_items.append(
|
|
729
|
+
{
|
|
730
|
+
"type": "computer_call",
|
|
731
|
+
"call_id": tool_call.get("id"),
|
|
732
|
+
"action": action,
|
|
733
|
+
"status": "completed",
|
|
734
|
+
}
|
|
735
|
+
)
|
|
650
736
|
except json.JSONDecodeError:
|
|
651
737
|
# Fallback to function call format
|
|
652
|
-
responses_items.append(
|
|
738
|
+
responses_items.append(
|
|
739
|
+
{
|
|
740
|
+
"type": "function_call",
|
|
741
|
+
"call_id": tool_call.get("id"),
|
|
742
|
+
"name": function_name,
|
|
743
|
+
"arguments": function.get("arguments", "{}"),
|
|
744
|
+
"status": "completed",
|
|
745
|
+
}
|
|
746
|
+
)
|
|
747
|
+
else:
|
|
748
|
+
# Regular function call
|
|
749
|
+
responses_items.append(
|
|
750
|
+
{
|
|
653
751
|
"type": "function_call",
|
|
654
752
|
"call_id": tool_call.get("id"),
|
|
655
753
|
"name": function_name,
|
|
656
754
|
"arguments": function.get("arguments", "{}"),
|
|
657
|
-
"status": "completed"
|
|
658
|
-
}
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
responses_items.append({
|
|
662
|
-
"type": "function_call",
|
|
663
|
-
"call_id": tool_call.get("id"),
|
|
664
|
-
"name": function_name,
|
|
665
|
-
"arguments": function.get("arguments", "{}"),
|
|
666
|
-
"status": "completed"
|
|
667
|
-
})
|
|
668
|
-
|
|
755
|
+
"status": "completed",
|
|
756
|
+
}
|
|
757
|
+
)
|
|
758
|
+
|
|
669
759
|
# Handle tool messages (function/computer call outputs)
|
|
670
760
|
elif role == "tool" and content:
|
|
671
761
|
tool_call_id = message.get("tool_call_id")
|
|
@@ -674,74 +764,90 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
674
764
|
if content == "[Execution completed. See screenshot below]":
|
|
675
765
|
# Look ahead for the next user message with image
|
|
676
766
|
next_idx = i + 1
|
|
677
|
-
if (
|
|
678
|
-
|
|
679
|
-
|
|
767
|
+
if (
|
|
768
|
+
next_idx < len(completion_messages)
|
|
769
|
+
and completion_messages[next_idx].get("role") == "user"
|
|
770
|
+
and isinstance(completion_messages[next_idx].get("content"), list)
|
|
771
|
+
):
|
|
680
772
|
# Found the pattern - extract image from next message
|
|
681
773
|
next_content = completion_messages[next_idx]["content"]
|
|
682
774
|
for item in next_content:
|
|
683
775
|
if item.get("type") == "image_url":
|
|
684
|
-
responses_items.append(
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
"
|
|
689
|
-
|
|
776
|
+
responses_items.append(
|
|
777
|
+
{
|
|
778
|
+
"type": "computer_call_output",
|
|
779
|
+
"call_id": tool_call_id,
|
|
780
|
+
"output": {
|
|
781
|
+
"type": "input_image",
|
|
782
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
783
|
+
},
|
|
690
784
|
}
|
|
691
|
-
|
|
785
|
+
)
|
|
692
786
|
# Skip the next user message since we processed it
|
|
693
787
|
skip_next = True
|
|
694
788
|
break
|
|
695
789
|
else:
|
|
696
790
|
# No matching user message, treat as regular text
|
|
697
|
-
responses_items.append(
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
791
|
+
responses_items.append(
|
|
792
|
+
{
|
|
793
|
+
"type": "computer_call_output",
|
|
794
|
+
"call_id": tool_call_id,
|
|
795
|
+
"output": content,
|
|
796
|
+
}
|
|
797
|
+
)
|
|
702
798
|
else:
|
|
703
799
|
# Determine if this is a computer call or function call output
|
|
704
800
|
try:
|
|
705
801
|
# Try to parse as structured output
|
|
706
802
|
parsed_content = json.loads(content)
|
|
707
803
|
if parsed_content.get("type") == "input_image":
|
|
708
|
-
responses_items.append(
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
804
|
+
responses_items.append(
|
|
805
|
+
{
|
|
806
|
+
"type": "computer_call_output",
|
|
807
|
+
"call_id": tool_call_id,
|
|
808
|
+
"output": parsed_content,
|
|
809
|
+
}
|
|
810
|
+
)
|
|
713
811
|
else:
|
|
714
|
-
responses_items.append(
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
812
|
+
responses_items.append(
|
|
813
|
+
{
|
|
814
|
+
"type": "computer_call_output",
|
|
815
|
+
"call_id": tool_call_id,
|
|
816
|
+
"output": content,
|
|
817
|
+
}
|
|
818
|
+
)
|
|
719
819
|
except json.JSONDecodeError:
|
|
720
820
|
# Plain text output - could be function or computer call
|
|
721
|
-
responses_items.append(
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
821
|
+
responses_items.append(
|
|
822
|
+
{
|
|
823
|
+
"type": "function_call_output",
|
|
824
|
+
"call_id": tool_call_id,
|
|
825
|
+
"output": content,
|
|
826
|
+
}
|
|
827
|
+
)
|
|
726
828
|
elif isinstance(content, list):
|
|
727
829
|
# Handle structured content (e.g., images)
|
|
728
830
|
for item in content:
|
|
729
831
|
if item.get("type") == "image_url":
|
|
730
|
-
responses_items.append(
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
"
|
|
735
|
-
|
|
832
|
+
responses_items.append(
|
|
833
|
+
{
|
|
834
|
+
"type": "computer_call_output",
|
|
835
|
+
"call_id": tool_call_id,
|
|
836
|
+
"output": {
|
|
837
|
+
"type": "input_image",
|
|
838
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
839
|
+
},
|
|
736
840
|
}
|
|
737
|
-
|
|
841
|
+
)
|
|
738
842
|
elif item.get("type") == "text":
|
|
739
|
-
responses_items.append(
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
843
|
+
responses_items.append(
|
|
844
|
+
{
|
|
845
|
+
"type": "function_call_output",
|
|
846
|
+
"call_id": tool_call_id,
|
|
847
|
+
"output": item.get("text"),
|
|
848
|
+
}
|
|
849
|
+
)
|
|
850
|
+
|
|
745
851
|
# Handle actual user messages
|
|
746
852
|
elif role == "user" and content:
|
|
747
853
|
if isinstance(content, list):
|
|
@@ -749,27 +855,21 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
749
855
|
user_content = []
|
|
750
856
|
for item in content:
|
|
751
857
|
if item.get("type") == "image_url":
|
|
752
|
-
user_content.append(
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
858
|
+
user_content.append(
|
|
859
|
+
{
|
|
860
|
+
"type": "input_image",
|
|
861
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
862
|
+
}
|
|
863
|
+
)
|
|
756
864
|
elif item.get("type") == "text":
|
|
757
|
-
user_content.append({
|
|
758
|
-
|
|
759
|
-
"text": item.get("text")
|
|
760
|
-
})
|
|
761
|
-
|
|
865
|
+
user_content.append({"type": "input_text", "text": item.get("text")})
|
|
866
|
+
|
|
762
867
|
if user_content:
|
|
763
|
-
responses_items.append(
|
|
764
|
-
"role": "user",
|
|
765
|
-
|
|
766
|
-
"content": user_content
|
|
767
|
-
})
|
|
868
|
+
responses_items.append(
|
|
869
|
+
{"role": "user", "type": "message", "content": user_content}
|
|
870
|
+
)
|
|
768
871
|
elif isinstance(content, str):
|
|
769
872
|
# Handle simple text user message
|
|
770
|
-
responses_items.append({
|
|
771
|
-
|
|
772
|
-
"content": content
|
|
773
|
-
})
|
|
774
|
-
|
|
873
|
+
responses_items.append({"role": "user", "content": content})
|
|
874
|
+
|
|
775
875
|
return responses_items
|