cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/responses.py
CHANGED
|
@@ -6,10 +6,10 @@ Based on the OpenAI spec for Responses API items.
|
|
|
6
6
|
import base64
|
|
7
7
|
import json
|
|
8
8
|
import uuid
|
|
9
|
-
from typing import
|
|
9
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
10
10
|
|
|
11
|
+
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
|
|
11
12
|
from openai.types.responses.response_computer_tool_call_param import (
|
|
12
|
-
ResponseComputerToolCallParam,
|
|
13
13
|
ActionClick,
|
|
14
14
|
ActionDoubleClick,
|
|
15
15
|
ActionDrag,
|
|
@@ -18,224 +18,222 @@ from openai.types.responses.response_computer_tool_call_param import (
|
|
|
18
18
|
ActionMove,
|
|
19
19
|
ActionScreenshot,
|
|
20
20
|
ActionScroll,
|
|
21
|
+
)
|
|
22
|
+
from openai.types.responses.response_computer_tool_call_param import (
|
|
21
23
|
ActionType as ActionTypeAction,
|
|
24
|
+
)
|
|
25
|
+
from openai.types.responses.response_computer_tool_call_param import (
|
|
22
26
|
ActionWait,
|
|
23
|
-
PendingSafetyCheck
|
|
27
|
+
PendingSafetyCheck,
|
|
28
|
+
ResponseComputerToolCallParam,
|
|
29
|
+
)
|
|
30
|
+
from openai.types.responses.response_function_tool_call_param import (
|
|
31
|
+
ResponseFunctionToolCallParam,
|
|
24
32
|
)
|
|
25
|
-
|
|
26
|
-
from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
|
|
27
|
-
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
|
|
28
|
-
from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
|
|
29
|
-
from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
|
|
30
|
-
from openai.types.responses.easy_input_message_param import EasyInputMessageParam
|
|
31
33
|
from openai.types.responses.response_input_image_param import ResponseInputImageParam
|
|
34
|
+
from openai.types.responses.response_output_message_param import (
|
|
35
|
+
ResponseOutputMessageParam,
|
|
36
|
+
)
|
|
37
|
+
from openai.types.responses.response_output_text_param import ResponseOutputTextParam
|
|
38
|
+
from openai.types.responses.response_reasoning_item_param import (
|
|
39
|
+
ResponseReasoningItemParam,
|
|
40
|
+
Summary,
|
|
41
|
+
)
|
|
42
|
+
|
|
32
43
|
|
|
33
44
|
def random_id():
|
|
34
45
|
return str(uuid.uuid4())
|
|
35
46
|
|
|
47
|
+
|
|
36
48
|
# User message items
|
|
37
49
|
def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
|
|
38
50
|
return EasyInputMessageParam(
|
|
39
51
|
content=[
|
|
40
52
|
ResponseInputImageParam(
|
|
41
53
|
type="input_image",
|
|
42
|
-
image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
|
|
43
|
-
)
|
|
54
|
+
image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}",
|
|
55
|
+
) # type: ignore
|
|
44
56
|
],
|
|
45
57
|
role="user",
|
|
46
|
-
type="message"
|
|
58
|
+
type="message",
|
|
47
59
|
)
|
|
48
60
|
|
|
61
|
+
|
|
49
62
|
# Text items
|
|
50
63
|
def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
|
|
51
64
|
return ResponseReasoningItemParam(
|
|
52
|
-
id=random_id(),
|
|
53
|
-
summary=[
|
|
54
|
-
Summary(text=reasoning, type="summary_text")
|
|
55
|
-
],
|
|
56
|
-
type="reasoning"
|
|
65
|
+
id=random_id(), summary=[Summary(text=reasoning, type="summary_text")], type="reasoning"
|
|
57
66
|
)
|
|
58
67
|
|
|
68
|
+
|
|
59
69
|
def make_output_text_item(content: str) -> ResponseOutputMessageParam:
|
|
60
70
|
return ResponseOutputMessageParam(
|
|
61
71
|
id=random_id(),
|
|
62
|
-
content=[
|
|
63
|
-
ResponseOutputTextParam(
|
|
64
|
-
text=content,
|
|
65
|
-
type="output_text",
|
|
66
|
-
annotations=[]
|
|
67
|
-
)
|
|
68
|
-
],
|
|
72
|
+
content=[ResponseOutputTextParam(text=content, type="output_text", annotations=[])],
|
|
69
73
|
role="assistant",
|
|
70
74
|
status="completed",
|
|
71
|
-
type="message"
|
|
75
|
+
type="message",
|
|
72
76
|
)
|
|
73
77
|
|
|
78
|
+
|
|
74
79
|
# Function call items
|
|
75
|
-
def make_function_call_item(
|
|
80
|
+
def make_function_call_item(
|
|
81
|
+
function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
|
|
82
|
+
) -> ResponseFunctionToolCallParam:
|
|
76
83
|
return ResponseFunctionToolCallParam(
|
|
77
84
|
id=random_id(),
|
|
78
85
|
call_id=call_id if call_id else random_id(),
|
|
79
86
|
name=function_name,
|
|
80
87
|
arguments=json.dumps(arguments),
|
|
81
88
|
status="completed",
|
|
82
|
-
type="function_call"
|
|
89
|
+
type="function_call",
|
|
83
90
|
)
|
|
84
91
|
|
|
92
|
+
|
|
85
93
|
# Computer tool call items
|
|
86
|
-
def make_click_item(
|
|
94
|
+
def make_click_item(
|
|
95
|
+
x: int,
|
|
96
|
+
y: int,
|
|
97
|
+
button: Literal["left", "right", "wheel", "back", "forward"] = "left",
|
|
98
|
+
call_id: Optional[str] = None,
|
|
99
|
+
) -> ResponseComputerToolCallParam:
|
|
87
100
|
return ResponseComputerToolCallParam(
|
|
88
101
|
id=random_id(),
|
|
89
102
|
call_id=call_id if call_id else random_id(),
|
|
90
|
-
action=ActionClick(
|
|
91
|
-
button=button,
|
|
92
|
-
type="click",
|
|
93
|
-
x=x,
|
|
94
|
-
y=y
|
|
95
|
-
),
|
|
103
|
+
action=ActionClick(button=button, type="click", x=x, y=y),
|
|
96
104
|
pending_safety_checks=[],
|
|
97
105
|
status="completed",
|
|
98
|
-
type="computer_call"
|
|
106
|
+
type="computer_call",
|
|
99
107
|
)
|
|
100
108
|
|
|
101
|
-
|
|
109
|
+
|
|
110
|
+
def make_double_click_item(
|
|
111
|
+
x: int, y: int, call_id: Optional[str] = None
|
|
112
|
+
) -> ResponseComputerToolCallParam:
|
|
102
113
|
return ResponseComputerToolCallParam(
|
|
103
114
|
id=random_id(),
|
|
104
115
|
call_id=call_id if call_id else random_id(),
|
|
105
|
-
action=ActionDoubleClick(
|
|
106
|
-
type="double_click",
|
|
107
|
-
x=x,
|
|
108
|
-
y=y
|
|
109
|
-
),
|
|
116
|
+
action=ActionDoubleClick(type="double_click", x=x, y=y),
|
|
110
117
|
pending_safety_checks=[],
|
|
111
118
|
status="completed",
|
|
112
|
-
type="computer_call"
|
|
119
|
+
type="computer_call",
|
|
113
120
|
)
|
|
114
121
|
|
|
115
|
-
|
|
122
|
+
|
|
123
|
+
def make_drag_item(
|
|
124
|
+
path: List[Dict[str, int]], call_id: Optional[str] = None
|
|
125
|
+
) -> ResponseComputerToolCallParam:
|
|
116
126
|
drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
|
|
117
127
|
return ResponseComputerToolCallParam(
|
|
118
128
|
id=random_id(),
|
|
119
129
|
call_id=call_id if call_id else random_id(),
|
|
120
|
-
action=ActionDrag(
|
|
121
|
-
path=drag_path,
|
|
122
|
-
type="drag"
|
|
123
|
-
),
|
|
130
|
+
action=ActionDrag(path=drag_path, type="drag"),
|
|
124
131
|
pending_safety_checks=[],
|
|
125
132
|
status="completed",
|
|
126
|
-
type="computer_call"
|
|
133
|
+
type="computer_call",
|
|
127
134
|
)
|
|
128
135
|
|
|
129
|
-
|
|
136
|
+
|
|
137
|
+
def make_keypress_item(
|
|
138
|
+
keys: List[str], call_id: Optional[str] = None
|
|
139
|
+
) -> ResponseComputerToolCallParam:
|
|
130
140
|
return ResponseComputerToolCallParam(
|
|
131
141
|
id=random_id(),
|
|
132
142
|
call_id=call_id if call_id else random_id(),
|
|
133
|
-
action=ActionKeypress(
|
|
134
|
-
keys=keys,
|
|
135
|
-
type="keypress"
|
|
136
|
-
),
|
|
143
|
+
action=ActionKeypress(keys=keys, type="keypress"),
|
|
137
144
|
pending_safety_checks=[],
|
|
138
145
|
status="completed",
|
|
139
|
-
type="computer_call"
|
|
146
|
+
type="computer_call",
|
|
140
147
|
)
|
|
141
148
|
|
|
149
|
+
|
|
142
150
|
def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
143
151
|
return ResponseComputerToolCallParam(
|
|
144
152
|
id=random_id(),
|
|
145
153
|
call_id=call_id if call_id else random_id(),
|
|
146
|
-
action=ActionMove(
|
|
147
|
-
type="move",
|
|
148
|
-
x=x,
|
|
149
|
-
y=y
|
|
150
|
-
),
|
|
154
|
+
action=ActionMove(type="move", x=x, y=y),
|
|
151
155
|
pending_safety_checks=[],
|
|
152
156
|
status="completed",
|
|
153
|
-
type="computer_call"
|
|
157
|
+
type="computer_call",
|
|
154
158
|
)
|
|
155
159
|
|
|
160
|
+
|
|
156
161
|
def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
157
162
|
return ResponseComputerToolCallParam(
|
|
158
163
|
id=random_id(),
|
|
159
164
|
call_id=call_id if call_id else random_id(),
|
|
160
|
-
action=ActionScreenshot(
|
|
161
|
-
type="screenshot"
|
|
162
|
-
),
|
|
165
|
+
action=ActionScreenshot(type="screenshot"),
|
|
163
166
|
pending_safety_checks=[],
|
|
164
167
|
status="completed",
|
|
165
|
-
type="computer_call"
|
|
168
|
+
type="computer_call",
|
|
166
169
|
)
|
|
167
170
|
|
|
168
|
-
|
|
171
|
+
|
|
172
|
+
def make_scroll_item(
|
|
173
|
+
x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
|
|
174
|
+
) -> ResponseComputerToolCallParam:
|
|
169
175
|
return ResponseComputerToolCallParam(
|
|
170
176
|
id=random_id(),
|
|
171
177
|
call_id=call_id if call_id else random_id(),
|
|
172
|
-
action=ActionScroll(
|
|
173
|
-
scroll_x=scroll_x,
|
|
174
|
-
scroll_y=scroll_y,
|
|
175
|
-
type="scroll",
|
|
176
|
-
x=x,
|
|
177
|
-
y=y
|
|
178
|
-
),
|
|
178
|
+
action=ActionScroll(scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y),
|
|
179
179
|
pending_safety_checks=[],
|
|
180
180
|
status="completed",
|
|
181
|
-
type="computer_call"
|
|
181
|
+
type="computer_call",
|
|
182
182
|
)
|
|
183
183
|
|
|
184
|
+
|
|
184
185
|
def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
185
186
|
return ResponseComputerToolCallParam(
|
|
186
187
|
id=random_id(),
|
|
187
188
|
call_id=call_id if call_id else random_id(),
|
|
188
|
-
action=ActionTypeAction(
|
|
189
|
-
text=text,
|
|
190
|
-
type="type"
|
|
191
|
-
),
|
|
189
|
+
action=ActionTypeAction(text=text, type="type"),
|
|
192
190
|
pending_safety_checks=[],
|
|
193
191
|
status="completed",
|
|
194
|
-
type="computer_call"
|
|
192
|
+
type="computer_call",
|
|
195
193
|
)
|
|
196
194
|
|
|
195
|
+
|
|
197
196
|
def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
|
|
198
197
|
return ResponseComputerToolCallParam(
|
|
199
198
|
id=random_id(),
|
|
200
199
|
call_id=call_id if call_id else random_id(),
|
|
201
|
-
action=ActionWait(
|
|
202
|
-
type="wait"
|
|
203
|
-
),
|
|
200
|
+
action=ActionWait(type="wait"),
|
|
204
201
|
pending_safety_checks=[],
|
|
205
202
|
status="completed",
|
|
206
|
-
type="computer_call"
|
|
203
|
+
type="computer_call",
|
|
207
204
|
)
|
|
208
205
|
|
|
206
|
+
|
|
209
207
|
# Extra anthropic computer calls
|
|
210
|
-
def make_left_mouse_down_item(
|
|
208
|
+
def make_left_mouse_down_item(
|
|
209
|
+
x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
|
|
210
|
+
) -> Dict[str, Any]:
|
|
211
211
|
return {
|
|
212
212
|
"id": random_id(),
|
|
213
213
|
"call_id": call_id if call_id else random_id(),
|
|
214
|
-
"action": {
|
|
215
|
-
"type": "left_mouse_down",
|
|
216
|
-
"x": x,
|
|
217
|
-
"y": y
|
|
218
|
-
},
|
|
214
|
+
"action": {"type": "left_mouse_down", "x": x, "y": y},
|
|
219
215
|
"pending_safety_checks": [],
|
|
220
216
|
"status": "completed",
|
|
221
|
-
"type": "computer_call"
|
|
217
|
+
"type": "computer_call",
|
|
222
218
|
}
|
|
223
219
|
|
|
224
|
-
|
|
220
|
+
|
|
221
|
+
def make_left_mouse_up_item(
|
|
222
|
+
x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
|
|
223
|
+
) -> Dict[str, Any]:
|
|
225
224
|
return {
|
|
226
225
|
"id": random_id(),
|
|
227
226
|
"call_id": call_id if call_id else random_id(),
|
|
228
|
-
"action": {
|
|
229
|
-
"type": "left_mouse_up",
|
|
230
|
-
"x": x,
|
|
231
|
-
"y": y
|
|
232
|
-
},
|
|
227
|
+
"action": {"type": "left_mouse_up", "x": x, "y": y},
|
|
233
228
|
"pending_safety_checks": [],
|
|
234
229
|
"status": "completed",
|
|
235
|
-
"type": "computer_call"
|
|
230
|
+
"type": "computer_call",
|
|
236
231
|
}
|
|
237
232
|
|
|
238
|
-
|
|
233
|
+
|
|
234
|
+
def make_failed_tool_call_items(
|
|
235
|
+
tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
|
|
236
|
+
) -> List[Dict[str, Any]]:
|
|
239
237
|
call_id = call_id if call_id else random_id()
|
|
240
238
|
return [
|
|
241
239
|
{
|
|
@@ -249,9 +247,10 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
|
|
|
249
247
|
"type": "function_call_output",
|
|
250
248
|
"call_id": call_id,
|
|
251
249
|
"output": json.dumps({"error": error_message}),
|
|
252
|
-
}
|
|
250
|
+
},
|
|
253
251
|
]
|
|
254
252
|
|
|
253
|
+
|
|
255
254
|
def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
|
|
256
255
|
call_id = call_id if call_id else random_id()
|
|
257
256
|
return {
|
|
@@ -260,12 +259,15 @@ def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> D
|
|
|
260
259
|
"output": json.dumps({"error": error_message}),
|
|
261
260
|
}
|
|
262
261
|
|
|
263
|
-
|
|
262
|
+
|
|
263
|
+
def replace_failed_computer_calls_with_function_calls(
|
|
264
|
+
messages: List[Dict[str, Any]],
|
|
265
|
+
) -> List[Dict[str, Any]]:
|
|
264
266
|
"""
|
|
265
267
|
Replace computer_call items with function_call items if they share a call_id with a function_call_output.
|
|
266
268
|
This indicates the computer call failed and should be treated as a function call instead.
|
|
267
269
|
We do this because the computer_call_output items do not support text output.
|
|
268
|
-
|
|
270
|
+
|
|
269
271
|
Args:
|
|
270
272
|
messages: List of message items to process
|
|
271
273
|
"""
|
|
@@ -278,16 +280,15 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
|
|
|
278
280
|
call_id = msg.get("call_id")
|
|
279
281
|
if call_id:
|
|
280
282
|
failed_call_ids.add(call_id)
|
|
281
|
-
|
|
283
|
+
|
|
282
284
|
# Replace computer_call items that have matching call_ids
|
|
283
285
|
for i, msg in enumerate(messages):
|
|
284
|
-
if
|
|
285
|
-
|
|
286
|
-
|
|
286
|
+
if msg.get("type") == "computer_call" and msg.get("call_id") in failed_call_ids:
|
|
287
|
+
|
|
287
288
|
# Extract action from computer_call
|
|
288
289
|
action = msg.get("action", {})
|
|
289
290
|
call_id = msg.get("call_id")
|
|
290
|
-
|
|
291
|
+
|
|
291
292
|
# Create function_call replacement
|
|
292
293
|
messages[i] = {
|
|
293
294
|
"type": "function_call",
|
|
@@ -296,27 +297,30 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
|
|
|
296
297
|
"name": "computer",
|
|
297
298
|
"arguments": json.dumps(action),
|
|
298
299
|
}
|
|
299
|
-
|
|
300
|
+
|
|
300
301
|
return messages
|
|
301
302
|
|
|
303
|
+
|
|
302
304
|
# Conversion functions between element descriptions and coordinates
|
|
303
|
-
def convert_computer_calls_desc2xy(
|
|
305
|
+
def convert_computer_calls_desc2xy(
|
|
306
|
+
responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
|
|
307
|
+
) -> List[Dict[str, Any]]:
|
|
304
308
|
"""
|
|
305
309
|
Convert computer calls from element descriptions to x,y coordinates.
|
|
306
|
-
|
|
310
|
+
|
|
307
311
|
Args:
|
|
308
312
|
responses_items: List of response items containing computer calls with element_description
|
|
309
313
|
desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
|
|
310
|
-
|
|
314
|
+
|
|
311
315
|
Returns:
|
|
312
316
|
List of response items with element_description replaced by x,y coordinates
|
|
313
317
|
"""
|
|
314
318
|
converted_items = []
|
|
315
|
-
|
|
319
|
+
|
|
316
320
|
for item in responses_items:
|
|
317
321
|
if item.get("type") == "computer_call" and "action" in item:
|
|
318
322
|
action = item["action"].copy()
|
|
319
|
-
|
|
323
|
+
|
|
320
324
|
# Handle single element_description
|
|
321
325
|
if "element_description" in action:
|
|
322
326
|
desc = action["element_description"]
|
|
@@ -325,48 +329,50 @@ def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2x
|
|
|
325
329
|
action["x"] = x
|
|
326
330
|
action["y"] = y
|
|
327
331
|
del action["element_description"]
|
|
328
|
-
|
|
332
|
+
|
|
329
333
|
# Handle start_element_description and end_element_description for drag operations
|
|
330
334
|
elif "start_element_description" in action and "end_element_description" in action:
|
|
331
335
|
start_desc = action["start_element_description"]
|
|
332
336
|
end_desc = action["end_element_description"]
|
|
333
|
-
|
|
337
|
+
|
|
334
338
|
if start_desc in desc2xy and end_desc in desc2xy:
|
|
335
339
|
start_x, start_y = desc2xy[start_desc]
|
|
336
340
|
end_x, end_y = desc2xy[end_desc]
|
|
337
341
|
action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
|
|
338
342
|
del action["start_element_description"]
|
|
339
343
|
del action["end_element_description"]
|
|
340
|
-
|
|
344
|
+
|
|
341
345
|
converted_item = item.copy()
|
|
342
346
|
converted_item["action"] = action
|
|
343
347
|
converted_items.append(converted_item)
|
|
344
348
|
else:
|
|
345
349
|
converted_items.append(item)
|
|
346
|
-
|
|
350
|
+
|
|
347
351
|
return converted_items
|
|
348
352
|
|
|
349
353
|
|
|
350
|
-
def convert_computer_calls_xy2desc(
|
|
354
|
+
def convert_computer_calls_xy2desc(
|
|
355
|
+
responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
|
|
356
|
+
) -> List[Dict[str, Any]]:
|
|
351
357
|
"""
|
|
352
358
|
Convert computer calls from x,y coordinates to element descriptions.
|
|
353
|
-
|
|
359
|
+
|
|
354
360
|
Args:
|
|
355
361
|
responses_items: List of response items containing computer calls with x,y coordinates
|
|
356
362
|
desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
|
|
357
|
-
|
|
363
|
+
|
|
358
364
|
Returns:
|
|
359
365
|
List of response items with x,y coordinates replaced by element_description
|
|
360
366
|
"""
|
|
361
367
|
# Create reverse mapping from coordinates to descriptions
|
|
362
368
|
xy2desc = {coords: desc for desc, coords in desc2xy.items()}
|
|
363
|
-
|
|
369
|
+
|
|
364
370
|
converted_items = []
|
|
365
|
-
|
|
371
|
+
|
|
366
372
|
for item in responses_items:
|
|
367
373
|
if item.get("type") == "computer_call" and "action" in item:
|
|
368
374
|
action = item["action"].copy()
|
|
369
|
-
|
|
375
|
+
|
|
370
376
|
# Handle single x,y coordinates
|
|
371
377
|
if "x" in action and "y" in action:
|
|
372
378
|
coords = (action["x"], action["y"])
|
|
@@ -374,77 +380,83 @@ def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2x
|
|
|
374
380
|
action["element_description"] = xy2desc[coords]
|
|
375
381
|
del action["x"]
|
|
376
382
|
del action["y"]
|
|
377
|
-
|
|
383
|
+
|
|
378
384
|
# Handle path for drag operations
|
|
379
385
|
elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
|
|
380
386
|
start_point = action["path"][0]
|
|
381
387
|
end_point = action["path"][1]
|
|
382
|
-
|
|
383
|
-
if (
|
|
384
|
-
"x" in
|
|
385
|
-
|
|
388
|
+
|
|
389
|
+
if (
|
|
390
|
+
"x" in start_point
|
|
391
|
+
and "y" in start_point
|
|
392
|
+
and "x" in end_point
|
|
393
|
+
and "y" in end_point
|
|
394
|
+
):
|
|
395
|
+
|
|
386
396
|
start_coords = (start_point["x"], start_point["y"])
|
|
387
397
|
end_coords = (end_point["x"], end_point["y"])
|
|
388
|
-
|
|
398
|
+
|
|
389
399
|
if start_coords in xy2desc and end_coords in xy2desc:
|
|
390
400
|
action["start_element_description"] = xy2desc[start_coords]
|
|
391
401
|
action["end_element_description"] = xy2desc[end_coords]
|
|
392
402
|
del action["path"]
|
|
393
|
-
|
|
403
|
+
|
|
394
404
|
converted_item = item.copy()
|
|
395
405
|
converted_item["action"] = action
|
|
396
406
|
converted_items.append(converted_item)
|
|
397
407
|
else:
|
|
398
408
|
converted_items.append(item)
|
|
399
|
-
|
|
409
|
+
|
|
400
410
|
return converted_items
|
|
401
411
|
|
|
402
412
|
|
|
403
413
|
def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
|
|
404
414
|
"""
|
|
405
415
|
Extract all element descriptions from computer calls in responses items.
|
|
406
|
-
|
|
416
|
+
|
|
407
417
|
Args:
|
|
408
418
|
responses_items: List of response items containing computer calls
|
|
409
|
-
|
|
419
|
+
|
|
410
420
|
Returns:
|
|
411
421
|
List of unique element descriptions found in computer calls
|
|
412
422
|
"""
|
|
413
423
|
descriptions = set()
|
|
414
|
-
|
|
424
|
+
|
|
415
425
|
for item in responses_items:
|
|
416
426
|
if item.get("type") == "computer_call" and "action" in item:
|
|
417
427
|
action = item["action"]
|
|
418
|
-
|
|
428
|
+
|
|
419
429
|
# Handle single element_description
|
|
420
430
|
if "element_description" in action:
|
|
421
431
|
descriptions.add(action["element_description"])
|
|
422
|
-
|
|
432
|
+
|
|
423
433
|
# Handle start_element_description and end_element_description for drag operations
|
|
424
434
|
if "start_element_description" in action:
|
|
425
435
|
descriptions.add(action["start_element_description"])
|
|
426
|
-
|
|
436
|
+
|
|
427
437
|
if "end_element_description" in action:
|
|
428
438
|
descriptions.add(action["end_element_description"])
|
|
429
|
-
|
|
439
|
+
|
|
430
440
|
return list(descriptions)
|
|
431
441
|
|
|
432
442
|
|
|
433
443
|
# Conversion functions between responses_items and completion messages formats
|
|
434
|
-
def convert_responses_items_to_completion_messages(
|
|
444
|
+
def convert_responses_items_to_completion_messages(
|
|
445
|
+
messages: List[Dict[str, Any]], allow_images_in_tool_results: bool = True
|
|
446
|
+
) -> List[Dict[str, Any]]:
|
|
435
447
|
"""Convert responses_items message format to liteLLM completion format.
|
|
436
|
-
|
|
448
|
+
|
|
437
449
|
Args:
|
|
438
450
|
messages: List of responses_items format messages
|
|
439
451
|
allow_images_in_tool_results: If True, include images in tool role messages.
|
|
440
452
|
If False, send tool message + separate user message with image.
|
|
441
453
|
"""
|
|
442
454
|
completion_messages = []
|
|
443
|
-
|
|
455
|
+
|
|
444
456
|
for message in messages:
|
|
445
457
|
msg_type = message.get("type")
|
|
446
458
|
role = message.get("role")
|
|
447
|
-
|
|
459
|
+
|
|
448
460
|
# Handle user messages (both with and without explicit type)
|
|
449
461
|
if role == "user" or msg_type == "user":
|
|
450
462
|
content = message.get("content", "")
|
|
@@ -453,34 +465,19 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
453
465
|
completion_content = []
|
|
454
466
|
for item in content:
|
|
455
467
|
if item.get("type") == "input_image":
|
|
456
|
-
completion_content.append(
|
|
457
|
-
"type": "image_url",
|
|
458
|
-
|
|
459
|
-
"url": item.get("image_url")
|
|
460
|
-
}
|
|
461
|
-
})
|
|
468
|
+
completion_content.append(
|
|
469
|
+
{"type": "image_url", "image_url": {"url": item.get("image_url")}}
|
|
470
|
+
)
|
|
462
471
|
elif item.get("type") == "input_text":
|
|
463
|
-
completion_content.append({
|
|
464
|
-
"type": "text",
|
|
465
|
-
"text": item.get("text")
|
|
466
|
-
})
|
|
472
|
+
completion_content.append({"type": "text", "text": item.get("text")})
|
|
467
473
|
elif item.get("type") == "text":
|
|
468
|
-
completion_content.append({
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
})
|
|
472
|
-
|
|
473
|
-
completion_messages.append({
|
|
474
|
-
"role": "user",
|
|
475
|
-
"content": completion_content
|
|
476
|
-
})
|
|
474
|
+
completion_content.append({"type": "text", "text": item.get("text")})
|
|
475
|
+
|
|
476
|
+
completion_messages.append({"role": "user", "content": completion_content})
|
|
477
477
|
elif isinstance(content, str):
|
|
478
478
|
# Handle string content
|
|
479
|
-
completion_messages.append({
|
|
480
|
-
|
|
481
|
-
"content": content
|
|
482
|
-
})
|
|
483
|
-
|
|
479
|
+
completion_messages.append({"role": "user", "content": content})
|
|
480
|
+
|
|
484
481
|
# Handle assistant messages
|
|
485
482
|
elif role == "assistant" or msg_type == "message":
|
|
486
483
|
content = message.get("content", [])
|
|
@@ -491,13 +488,12 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
491
488
|
text_parts.append(item.get("text", ""))
|
|
492
489
|
elif item.get("type") == "text":
|
|
493
490
|
text_parts.append(item.get("text", ""))
|
|
494
|
-
|
|
491
|
+
|
|
495
492
|
if text_parts:
|
|
496
|
-
completion_messages.append(
|
|
497
|
-
"role": "assistant",
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
493
|
+
completion_messages.append(
|
|
494
|
+
{"role": "assistant", "content": "\n".join(text_parts)}
|
|
495
|
+
)
|
|
496
|
+
|
|
501
497
|
# Handle reasoning items (convert to assistant message)
|
|
502
498
|
elif msg_type == "reasoning":
|
|
503
499
|
summary = message.get("summary", [])
|
|
@@ -505,107 +501,96 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
|
|
|
505
501
|
for item in summary:
|
|
506
502
|
if item.get("type") == "summary_text":
|
|
507
503
|
text_parts.append(item.get("text", ""))
|
|
508
|
-
|
|
504
|
+
|
|
509
505
|
if text_parts:
|
|
510
|
-
completion_messages.append({
|
|
511
|
-
|
|
512
|
-
"content": "\n".join(text_parts)
|
|
513
|
-
})
|
|
514
|
-
|
|
506
|
+
completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})
|
|
507
|
+
|
|
515
508
|
# Handle function calls
|
|
516
509
|
elif msg_type == "function_call":
|
|
517
510
|
# Add tool call to last assistant message or create new one
|
|
518
511
|
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
519
|
-
completion_messages.append({
|
|
520
|
-
|
|
521
|
-
"content": "",
|
|
522
|
-
"tool_calls": []
|
|
523
|
-
})
|
|
524
|
-
|
|
512
|
+
completion_messages.append({"role": "assistant", "content": "", "tool_calls": []})
|
|
513
|
+
|
|
525
514
|
if "tool_calls" not in completion_messages[-1]:
|
|
526
515
|
completion_messages[-1]["tool_calls"] = []
|
|
527
|
-
|
|
528
|
-
completion_messages[-1]["tool_calls"].append(
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
"
|
|
533
|
-
|
|
516
|
+
|
|
517
|
+
completion_messages[-1]["tool_calls"].append(
|
|
518
|
+
{
|
|
519
|
+
"id": message.get("call_id"),
|
|
520
|
+
"type": "function",
|
|
521
|
+
"function": {
|
|
522
|
+
"name": message.get("name"),
|
|
523
|
+
"arguments": message.get("arguments"),
|
|
524
|
+
},
|
|
534
525
|
}
|
|
535
|
-
|
|
536
|
-
|
|
526
|
+
)
|
|
527
|
+
|
|
537
528
|
# Handle computer calls
|
|
538
529
|
elif msg_type == "computer_call":
|
|
539
530
|
# Add tool call to last assistant message or create new one
|
|
540
531
|
if not completion_messages or completion_messages[-1]["role"] != "assistant":
|
|
541
|
-
completion_messages.append({
|
|
542
|
-
|
|
543
|
-
"content": "",
|
|
544
|
-
"tool_calls": []
|
|
545
|
-
})
|
|
546
|
-
|
|
532
|
+
completion_messages.append({"role": "assistant", "content": "", "tool_calls": []})
|
|
533
|
+
|
|
547
534
|
if "tool_calls" not in completion_messages[-1]:
|
|
548
535
|
completion_messages[-1]["tool_calls"] = []
|
|
549
|
-
|
|
536
|
+
|
|
550
537
|
action = message.get("action", {})
|
|
551
|
-
completion_messages[-1]["tool_calls"].append(
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
"name": "computer",
|
|
556
|
-
"arguments": json.dumps(action)
|
|
538
|
+
completion_messages[-1]["tool_calls"].append(
|
|
539
|
+
{
|
|
540
|
+
"id": message.get("call_id"),
|
|
541
|
+
"type": "function",
|
|
542
|
+
"function": {"name": "computer", "arguments": json.dumps(action)},
|
|
557
543
|
}
|
|
558
|
-
|
|
559
|
-
|
|
544
|
+
)
|
|
545
|
+
|
|
560
546
|
# Handle function/computer call outputs
|
|
561
547
|
elif msg_type in ["function_call_output", "computer_call_output"]:
|
|
562
548
|
output = message.get("output")
|
|
563
549
|
call_id = message.get("call_id")
|
|
564
|
-
|
|
550
|
+
|
|
565
551
|
if isinstance(output, dict) and output.get("type") == "input_image":
|
|
566
552
|
if allow_images_in_tool_results:
|
|
567
553
|
# Handle image output as tool response (may not work with all APIs)
|
|
568
|
-
completion_messages.append(
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
"
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
})
|
|
554
|
+
completion_messages.append(
|
|
555
|
+
{
|
|
556
|
+
"role": "tool",
|
|
557
|
+
"tool_call_id": call_id,
|
|
558
|
+
"content": [
|
|
559
|
+
{"type": "image_url", "image_url": {"url": output.get("image_url")}}
|
|
560
|
+
],
|
|
561
|
+
}
|
|
562
|
+
)
|
|
578
563
|
else:
|
|
579
564
|
# Send tool message + separate user message with image (OpenAI compatible)
|
|
580
|
-
completion_messages += [
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
"
|
|
588
|
-
"
|
|
589
|
-
"url": output.get("image_url")
|
|
590
|
-
|
|
591
|
-
}
|
|
592
|
-
|
|
565
|
+
completion_messages += [
|
|
566
|
+
{
|
|
567
|
+
"role": "tool",
|
|
568
|
+
"tool_call_id": call_id,
|
|
569
|
+
"content": "[Execution completed. See screenshot below]",
|
|
570
|
+
},
|
|
571
|
+
{
|
|
572
|
+
"role": "user",
|
|
573
|
+
"content": [
|
|
574
|
+
{"type": "image_url", "image_url": {"url": output.get("image_url")}}
|
|
575
|
+
],
|
|
576
|
+
},
|
|
577
|
+
]
|
|
593
578
|
else:
|
|
594
579
|
# Handle text output as tool response
|
|
595
|
-
completion_messages.append(
|
|
596
|
-
"role": "tool",
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
})
|
|
600
|
-
|
|
580
|
+
completion_messages.append(
|
|
581
|
+
{"role": "tool", "tool_call_id": call_id, "content": str(output)}
|
|
582
|
+
)
|
|
583
|
+
|
|
601
584
|
return completion_messages
|
|
602
585
|
|
|
603
586
|
|
|
604
|
-
def convert_completion_messages_to_responses_items(
|
|
587
|
+
def convert_completion_messages_to_responses_items(
|
|
588
|
+
completion_messages: List[Dict[str, Any]],
|
|
589
|
+
) -> List[Dict[str, Any]]:
|
|
605
590
|
"""Convert completion messages format to responses_items message format."""
|
|
606
591
|
responses_items = []
|
|
607
592
|
skip_next = False
|
|
608
|
-
|
|
593
|
+
|
|
609
594
|
for i, message in enumerate(completion_messages):
|
|
610
595
|
if skip_next:
|
|
611
596
|
skip_next = False
|
|
@@ -614,25 +599,24 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
614
599
|
role = message.get("role")
|
|
615
600
|
content = message.get("content")
|
|
616
601
|
tool_calls = message.get("tool_calls", [])
|
|
617
|
-
|
|
602
|
+
|
|
618
603
|
# Handle assistant messages with text content
|
|
619
604
|
if role == "assistant" and content and isinstance(content, str):
|
|
620
|
-
responses_items.append(
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
"type": "output_text",
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
605
|
+
responses_items.append(
|
|
606
|
+
{
|
|
607
|
+
"type": "message",
|
|
608
|
+
"role": "assistant",
|
|
609
|
+
"content": [{"type": "output_text", "text": content}],
|
|
610
|
+
}
|
|
611
|
+
)
|
|
612
|
+
|
|
629
613
|
# Handle tool calls
|
|
630
614
|
if tool_calls:
|
|
631
615
|
for tool_call in tool_calls:
|
|
632
616
|
if tool_call.get("type") == "function":
|
|
633
617
|
function = tool_call.get("function", {})
|
|
634
618
|
function_name = function.get("name")
|
|
635
|
-
|
|
619
|
+
|
|
636
620
|
if function_name == "computer":
|
|
637
621
|
# Parse computer action
|
|
638
622
|
try:
|
|
@@ -641,31 +625,37 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
641
625
|
if action.get("action"):
|
|
642
626
|
action["type"] = action["action"]
|
|
643
627
|
del action["action"]
|
|
644
|
-
responses_items.append(
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
628
|
+
responses_items.append(
|
|
629
|
+
{
|
|
630
|
+
"type": "computer_call",
|
|
631
|
+
"call_id": tool_call.get("id"),
|
|
632
|
+
"action": action,
|
|
633
|
+
"status": "completed",
|
|
634
|
+
}
|
|
635
|
+
)
|
|
650
636
|
except json.JSONDecodeError:
|
|
651
637
|
# Fallback to function call format
|
|
652
|
-
responses_items.append(
|
|
638
|
+
responses_items.append(
|
|
639
|
+
{
|
|
640
|
+
"type": "function_call",
|
|
641
|
+
"call_id": tool_call.get("id"),
|
|
642
|
+
"name": function_name,
|
|
643
|
+
"arguments": function.get("arguments", "{}"),
|
|
644
|
+
"status": "completed",
|
|
645
|
+
}
|
|
646
|
+
)
|
|
647
|
+
else:
|
|
648
|
+
# Regular function call
|
|
649
|
+
responses_items.append(
|
|
650
|
+
{
|
|
653
651
|
"type": "function_call",
|
|
654
652
|
"call_id": tool_call.get("id"),
|
|
655
653
|
"name": function_name,
|
|
656
654
|
"arguments": function.get("arguments", "{}"),
|
|
657
|
-
"status": "completed"
|
|
658
|
-
}
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
responses_items.append({
|
|
662
|
-
"type": "function_call",
|
|
663
|
-
"call_id": tool_call.get("id"),
|
|
664
|
-
"name": function_name,
|
|
665
|
-
"arguments": function.get("arguments", "{}"),
|
|
666
|
-
"status": "completed"
|
|
667
|
-
})
|
|
668
|
-
|
|
655
|
+
"status": "completed",
|
|
656
|
+
}
|
|
657
|
+
)
|
|
658
|
+
|
|
669
659
|
# Handle tool messages (function/computer call outputs)
|
|
670
660
|
elif role == "tool" and content:
|
|
671
661
|
tool_call_id = message.get("tool_call_id")
|
|
@@ -674,74 +664,90 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
674
664
|
if content == "[Execution completed. See screenshot below]":
|
|
675
665
|
# Look ahead for the next user message with image
|
|
676
666
|
next_idx = i + 1
|
|
677
|
-
if (
|
|
678
|
-
|
|
679
|
-
|
|
667
|
+
if (
|
|
668
|
+
next_idx < len(completion_messages)
|
|
669
|
+
and completion_messages[next_idx].get("role") == "user"
|
|
670
|
+
and isinstance(completion_messages[next_idx].get("content"), list)
|
|
671
|
+
):
|
|
680
672
|
# Found the pattern - extract image from next message
|
|
681
673
|
next_content = completion_messages[next_idx]["content"]
|
|
682
674
|
for item in next_content:
|
|
683
675
|
if item.get("type") == "image_url":
|
|
684
|
-
responses_items.append(
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
"
|
|
689
|
-
|
|
676
|
+
responses_items.append(
|
|
677
|
+
{
|
|
678
|
+
"type": "computer_call_output",
|
|
679
|
+
"call_id": tool_call_id,
|
|
680
|
+
"output": {
|
|
681
|
+
"type": "input_image",
|
|
682
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
683
|
+
},
|
|
690
684
|
}
|
|
691
|
-
|
|
685
|
+
)
|
|
692
686
|
# Skip the next user message since we processed it
|
|
693
687
|
skip_next = True
|
|
694
688
|
break
|
|
695
689
|
else:
|
|
696
690
|
# No matching user message, treat as regular text
|
|
697
|
-
responses_items.append(
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
691
|
+
responses_items.append(
|
|
692
|
+
{
|
|
693
|
+
"type": "computer_call_output",
|
|
694
|
+
"call_id": tool_call_id,
|
|
695
|
+
"output": content,
|
|
696
|
+
}
|
|
697
|
+
)
|
|
702
698
|
else:
|
|
703
699
|
# Determine if this is a computer call or function call output
|
|
704
700
|
try:
|
|
705
701
|
# Try to parse as structured output
|
|
706
702
|
parsed_content = json.loads(content)
|
|
707
703
|
if parsed_content.get("type") == "input_image":
|
|
708
|
-
responses_items.append(
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
704
|
+
responses_items.append(
|
|
705
|
+
{
|
|
706
|
+
"type": "computer_call_output",
|
|
707
|
+
"call_id": tool_call_id,
|
|
708
|
+
"output": parsed_content,
|
|
709
|
+
}
|
|
710
|
+
)
|
|
713
711
|
else:
|
|
714
|
-
responses_items.append(
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
712
|
+
responses_items.append(
|
|
713
|
+
{
|
|
714
|
+
"type": "computer_call_output",
|
|
715
|
+
"call_id": tool_call_id,
|
|
716
|
+
"output": content,
|
|
717
|
+
}
|
|
718
|
+
)
|
|
719
719
|
except json.JSONDecodeError:
|
|
720
720
|
# Plain text output - could be function or computer call
|
|
721
|
-
responses_items.append(
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
721
|
+
responses_items.append(
|
|
722
|
+
{
|
|
723
|
+
"type": "function_call_output",
|
|
724
|
+
"call_id": tool_call_id,
|
|
725
|
+
"output": content,
|
|
726
|
+
}
|
|
727
|
+
)
|
|
726
728
|
elif isinstance(content, list):
|
|
727
729
|
# Handle structured content (e.g., images)
|
|
728
730
|
for item in content:
|
|
729
731
|
if item.get("type") == "image_url":
|
|
730
|
-
responses_items.append(
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
"
|
|
735
|
-
|
|
732
|
+
responses_items.append(
|
|
733
|
+
{
|
|
734
|
+
"type": "computer_call_output",
|
|
735
|
+
"call_id": tool_call_id,
|
|
736
|
+
"output": {
|
|
737
|
+
"type": "input_image",
|
|
738
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
739
|
+
},
|
|
736
740
|
}
|
|
737
|
-
|
|
741
|
+
)
|
|
738
742
|
elif item.get("type") == "text":
|
|
739
|
-
responses_items.append(
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
743
|
+
responses_items.append(
|
|
744
|
+
{
|
|
745
|
+
"type": "function_call_output",
|
|
746
|
+
"call_id": tool_call_id,
|
|
747
|
+
"output": item.get("text"),
|
|
748
|
+
}
|
|
749
|
+
)
|
|
750
|
+
|
|
745
751
|
# Handle actual user messages
|
|
746
752
|
elif role == "user" and content:
|
|
747
753
|
if isinstance(content, list):
|
|
@@ -749,27 +755,21 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
|
|
|
749
755
|
user_content = []
|
|
750
756
|
for item in content:
|
|
751
757
|
if item.get("type") == "image_url":
|
|
752
|
-
user_content.append(
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
758
|
+
user_content.append(
|
|
759
|
+
{
|
|
760
|
+
"type": "input_image",
|
|
761
|
+
"image_url": item.get("image_url", {}).get("url"),
|
|
762
|
+
}
|
|
763
|
+
)
|
|
756
764
|
elif item.get("type") == "text":
|
|
757
|
-
user_content.append({
|
|
758
|
-
|
|
759
|
-
"text": item.get("text")
|
|
760
|
-
})
|
|
761
|
-
|
|
765
|
+
user_content.append({"type": "input_text", "text": item.get("text")})
|
|
766
|
+
|
|
762
767
|
if user_content:
|
|
763
|
-
responses_items.append(
|
|
764
|
-
"role": "user",
|
|
765
|
-
|
|
766
|
-
"content": user_content
|
|
767
|
-
})
|
|
768
|
+
responses_items.append(
|
|
769
|
+
{"role": "user", "type": "message", "content": user_content}
|
|
770
|
+
)
|
|
768
771
|
elif isinstance(content, str):
|
|
769
772
|
# Handle simple text user message
|
|
770
|
-
responses_items.append({
|
|
771
|
-
|
|
772
|
-
"content": content
|
|
773
|
-
})
|
|
774
|
-
|
|
773
|
+
responses_items.append({"role": "user", "content": content})
|
|
774
|
+
|
|
775
775
|
return responses_items
|