cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/responses.py CHANGED
@@ -6,10 +6,10 @@ Based on the OpenAI spec for Responses API items.
6
6
  import base64
7
7
  import json
8
8
  import uuid
9
- from typing import List, Dict, Any, Literal, Union, Optional
9
+ from typing import Any, Dict, List, Literal, Optional, Union
10
10
 
11
+ from openai.types.responses.easy_input_message_param import EasyInputMessageParam
11
12
  from openai.types.responses.response_computer_tool_call_param import (
12
- ResponseComputerToolCallParam,
13
13
  ActionClick,
14
14
  ActionDoubleClick,
15
15
  ActionDrag,
@@ -18,224 +18,222 @@ from openai.types.responses.response_computer_tool_call_param import (
18
18
  ActionMove,
19
19
  ActionScreenshot,
20
20
  ActionScroll,
21
+ )
22
+ from openai.types.responses.response_computer_tool_call_param import (
21
23
  ActionType as ActionTypeAction,
24
+ )
25
+ from openai.types.responses.response_computer_tool_call_param import (
22
26
  ActionWait,
23
- PendingSafetyCheck
27
+ PendingSafetyCheck,
28
+ ResponseComputerToolCallParam,
29
+ )
30
+ from openai.types.responses.response_function_tool_call_param import (
31
+ ResponseFunctionToolCallParam,
24
32
  )
25
-
26
- from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
27
- from openai.types.responses.response_output_text_param import ResponseOutputTextParam
28
- from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
29
- from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
30
- from openai.types.responses.easy_input_message_param import EasyInputMessageParam
31
33
  from openai.types.responses.response_input_image_param import ResponseInputImageParam
34
+ from openai.types.responses.response_output_message_param import (
35
+ ResponseOutputMessageParam,
36
+ )
37
+ from openai.types.responses.response_output_text_param import ResponseOutputTextParam
38
+ from openai.types.responses.response_reasoning_item_param import (
39
+ ResponseReasoningItemParam,
40
+ Summary,
41
+ )
42
+
32
43
 
33
44
  def random_id():
34
45
  return str(uuid.uuid4())
35
46
 
47
+
36
48
  # User message items
37
49
  def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
38
50
  return EasyInputMessageParam(
39
51
  content=[
40
52
  ResponseInputImageParam(
41
53
  type="input_image",
42
- image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
43
- ) # type: ignore
54
+ image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}",
55
+ ) # type: ignore
44
56
  ],
45
57
  role="user",
46
- type="message"
58
+ type="message",
47
59
  )
48
60
 
61
+
49
62
  # Text items
50
63
  def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
51
64
  return ResponseReasoningItemParam(
52
- id=random_id(),
53
- summary=[
54
- Summary(text=reasoning, type="summary_text")
55
- ],
56
- type="reasoning"
65
+ id=random_id(), summary=[Summary(text=reasoning, type="summary_text")], type="reasoning"
57
66
  )
58
67
 
68
+
59
69
  def make_output_text_item(content: str) -> ResponseOutputMessageParam:
60
70
  return ResponseOutputMessageParam(
61
71
  id=random_id(),
62
- content=[
63
- ResponseOutputTextParam(
64
- text=content,
65
- type="output_text",
66
- annotations=[]
67
- )
68
- ],
72
+ content=[ResponseOutputTextParam(text=content, type="output_text", annotations=[])],
69
73
  role="assistant",
70
74
  status="completed",
71
- type="message"
75
+ type="message",
72
76
  )
73
77
 
78
+
74
79
  # Function call items
75
- def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
80
+ def make_function_call_item(
81
+ function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
82
+ ) -> ResponseFunctionToolCallParam:
76
83
  return ResponseFunctionToolCallParam(
77
84
  id=random_id(),
78
85
  call_id=call_id if call_id else random_id(),
79
86
  name=function_name,
80
87
  arguments=json.dumps(arguments),
81
88
  status="completed",
82
- type="function_call"
89
+ type="function_call",
83
90
  )
84
91
 
92
+
85
93
  # Computer tool call items
86
- def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
94
+ def make_click_item(
95
+ x: int,
96
+ y: int,
97
+ button: Literal["left", "right", "wheel", "back", "forward"] = "left",
98
+ call_id: Optional[str] = None,
99
+ ) -> ResponseComputerToolCallParam:
87
100
  return ResponseComputerToolCallParam(
88
101
  id=random_id(),
89
102
  call_id=call_id if call_id else random_id(),
90
- action=ActionClick(
91
- button=button,
92
- type="click",
93
- x=x,
94
- y=y
95
- ),
103
+ action=ActionClick(button=button, type="click", x=x, y=y),
96
104
  pending_safety_checks=[],
97
105
  status="completed",
98
- type="computer_call"
106
+ type="computer_call",
99
107
  )
100
108
 
101
- def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
109
+
110
+ def make_double_click_item(
111
+ x: int, y: int, call_id: Optional[str] = None
112
+ ) -> ResponseComputerToolCallParam:
102
113
  return ResponseComputerToolCallParam(
103
114
  id=random_id(),
104
115
  call_id=call_id if call_id else random_id(),
105
- action=ActionDoubleClick(
106
- type="double_click",
107
- x=x,
108
- y=y
109
- ),
116
+ action=ActionDoubleClick(type="double_click", x=x, y=y),
110
117
  pending_safety_checks=[],
111
118
  status="completed",
112
- type="computer_call"
119
+ type="computer_call",
113
120
  )
114
121
 
115
- def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
122
+
123
+ def make_drag_item(
124
+ path: List[Dict[str, int]], call_id: Optional[str] = None
125
+ ) -> ResponseComputerToolCallParam:
116
126
  drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
117
127
  return ResponseComputerToolCallParam(
118
128
  id=random_id(),
119
129
  call_id=call_id if call_id else random_id(),
120
- action=ActionDrag(
121
- path=drag_path,
122
- type="drag"
123
- ),
130
+ action=ActionDrag(path=drag_path, type="drag"),
124
131
  pending_safety_checks=[],
125
132
  status="completed",
126
- type="computer_call"
133
+ type="computer_call",
127
134
  )
128
135
 
129
- def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
136
+
137
+ def make_keypress_item(
138
+ keys: List[str], call_id: Optional[str] = None
139
+ ) -> ResponseComputerToolCallParam:
130
140
  return ResponseComputerToolCallParam(
131
141
  id=random_id(),
132
142
  call_id=call_id if call_id else random_id(),
133
- action=ActionKeypress(
134
- keys=keys,
135
- type="keypress"
136
- ),
143
+ action=ActionKeypress(keys=keys, type="keypress"),
137
144
  pending_safety_checks=[],
138
145
  status="completed",
139
- type="computer_call"
146
+ type="computer_call",
140
147
  )
141
148
 
149
+
142
150
  def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
143
151
  return ResponseComputerToolCallParam(
144
152
  id=random_id(),
145
153
  call_id=call_id if call_id else random_id(),
146
- action=ActionMove(
147
- type="move",
148
- x=x,
149
- y=y
150
- ),
154
+ action=ActionMove(type="move", x=x, y=y),
151
155
  pending_safety_checks=[],
152
156
  status="completed",
153
- type="computer_call"
157
+ type="computer_call",
154
158
  )
155
159
 
160
+
156
161
  def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
157
162
  return ResponseComputerToolCallParam(
158
163
  id=random_id(),
159
164
  call_id=call_id if call_id else random_id(),
160
- action=ActionScreenshot(
161
- type="screenshot"
162
- ),
165
+ action=ActionScreenshot(type="screenshot"),
163
166
  pending_safety_checks=[],
164
167
  status="completed",
165
- type="computer_call"
168
+ type="computer_call",
166
169
  )
167
170
 
168
- def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
171
+
172
+ def make_scroll_item(
173
+ x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
174
+ ) -> ResponseComputerToolCallParam:
169
175
  return ResponseComputerToolCallParam(
170
176
  id=random_id(),
171
177
  call_id=call_id if call_id else random_id(),
172
- action=ActionScroll(
173
- scroll_x=scroll_x,
174
- scroll_y=scroll_y,
175
- type="scroll",
176
- x=x,
177
- y=y
178
- ),
178
+ action=ActionScroll(scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y),
179
179
  pending_safety_checks=[],
180
180
  status="completed",
181
- type="computer_call"
181
+ type="computer_call",
182
182
  )
183
183
 
184
+
184
185
  def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
185
186
  return ResponseComputerToolCallParam(
186
187
  id=random_id(),
187
188
  call_id=call_id if call_id else random_id(),
188
- action=ActionTypeAction(
189
- text=text,
190
- type="type"
191
- ),
189
+ action=ActionTypeAction(text=text, type="type"),
192
190
  pending_safety_checks=[],
193
191
  status="completed",
194
- type="computer_call"
192
+ type="computer_call",
195
193
  )
196
194
 
195
+
197
196
  def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
198
197
  return ResponseComputerToolCallParam(
199
198
  id=random_id(),
200
199
  call_id=call_id if call_id else random_id(),
201
- action=ActionWait(
202
- type="wait"
203
- ),
200
+ action=ActionWait(type="wait"),
204
201
  pending_safety_checks=[],
205
202
  status="completed",
206
- type="computer_call"
203
+ type="computer_call",
207
204
  )
208
205
 
206
+
209
207
  # Extra anthropic computer calls
210
- def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
208
+ def make_left_mouse_down_item(
209
+ x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
210
+ ) -> Dict[str, Any]:
211
211
  return {
212
212
  "id": random_id(),
213
213
  "call_id": call_id if call_id else random_id(),
214
- "action": {
215
- "type": "left_mouse_down",
216
- "x": x,
217
- "y": y
218
- },
214
+ "action": {"type": "left_mouse_down", "x": x, "y": y},
219
215
  "pending_safety_checks": [],
220
216
  "status": "completed",
221
- "type": "computer_call"
217
+ "type": "computer_call",
222
218
  }
223
219
 
224
- def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
220
+
221
+ def make_left_mouse_up_item(
222
+ x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
223
+ ) -> Dict[str, Any]:
225
224
  return {
226
225
  "id": random_id(),
227
226
  "call_id": call_id if call_id else random_id(),
228
- "action": {
229
- "type": "left_mouse_up",
230
- "x": x,
231
- "y": y
232
- },
227
+ "action": {"type": "left_mouse_up", "x": x, "y": y},
233
228
  "pending_safety_checks": [],
234
229
  "status": "completed",
235
- "type": "computer_call"
230
+ "type": "computer_call",
236
231
  }
237
232
 
238
- def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]:
233
+
234
+ def make_failed_tool_call_items(
235
+ tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
236
+ ) -> List[Dict[str, Any]]:
239
237
  call_id = call_id if call_id else random_id()
240
238
  return [
241
239
  {
@@ -249,27 +247,80 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
249
247
  "type": "function_call_output",
250
248
  "call_id": call_id,
251
249
  "output": json.dumps({"error": error_message}),
252
- }
250
+ },
253
251
  ]
254
252
 
253
+
254
+ def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
255
+ call_id = call_id if call_id else random_id()
256
+ return {
257
+ "type": "function_call_output",
258
+ "call_id": call_id,
259
+ "output": json.dumps({"error": error_message}),
260
+ }
261
+
262
+
263
+ def replace_failed_computer_calls_with_function_calls(
264
+ messages: List[Dict[str, Any]],
265
+ ) -> List[Dict[str, Any]]:
266
+ """
267
+ Replace computer_call items with function_call items if they share a call_id with a function_call_output.
268
+ This indicates the computer call failed and should be treated as a function call instead.
269
+ We do this because the computer_call_output items do not support text output.
270
+
271
+ Args:
272
+ messages: List of message items to process
273
+ """
274
+ messages = messages.copy()
275
+
276
+ # Find all call_ids that have function_call_output items
277
+ failed_call_ids = set()
278
+ for msg in messages:
279
+ if msg.get("type") == "function_call_output":
280
+ call_id = msg.get("call_id")
281
+ if call_id:
282
+ failed_call_ids.add(call_id)
283
+
284
+ # Replace computer_call items that have matching call_ids
285
+ for i, msg in enumerate(messages):
286
+ if msg.get("type") == "computer_call" and msg.get("call_id") in failed_call_ids:
287
+
288
+ # Extract action from computer_call
289
+ action = msg.get("action", {})
290
+ call_id = msg.get("call_id")
291
+
292
+ # Create function_call replacement
293
+ messages[i] = {
294
+ "type": "function_call",
295
+ "id": msg.get("id", random_id()),
296
+ "call_id": call_id,
297
+ "name": "computer",
298
+ "arguments": json.dumps(action),
299
+ }
300
+
301
+ return messages
302
+
303
+
255
304
  # Conversion functions between element descriptions and coordinates
256
- def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
305
+ def convert_computer_calls_desc2xy(
306
+ responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
307
+ ) -> List[Dict[str, Any]]:
257
308
  """
258
309
  Convert computer calls from element descriptions to x,y coordinates.
259
-
310
+
260
311
  Args:
261
312
  responses_items: List of response items containing computer calls with element_description
262
313
  desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
263
-
314
+
264
315
  Returns:
265
316
  List of response items with element_description replaced by x,y coordinates
266
317
  """
267
318
  converted_items = []
268
-
319
+
269
320
  for item in responses_items:
270
321
  if item.get("type") == "computer_call" and "action" in item:
271
322
  action = item["action"].copy()
272
-
323
+
273
324
  # Handle single element_description
274
325
  if "element_description" in action:
275
326
  desc = action["element_description"]
@@ -278,48 +329,50 @@ def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2x
278
329
  action["x"] = x
279
330
  action["y"] = y
280
331
  del action["element_description"]
281
-
332
+
282
333
  # Handle start_element_description and end_element_description for drag operations
283
334
  elif "start_element_description" in action and "end_element_description" in action:
284
335
  start_desc = action["start_element_description"]
285
336
  end_desc = action["end_element_description"]
286
-
337
+
287
338
  if start_desc in desc2xy and end_desc in desc2xy:
288
339
  start_x, start_y = desc2xy[start_desc]
289
340
  end_x, end_y = desc2xy[end_desc]
290
341
  action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
291
342
  del action["start_element_description"]
292
343
  del action["end_element_description"]
293
-
344
+
294
345
  converted_item = item.copy()
295
346
  converted_item["action"] = action
296
347
  converted_items.append(converted_item)
297
348
  else:
298
349
  converted_items.append(item)
299
-
350
+
300
351
  return converted_items
301
352
 
302
353
 
303
- def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
354
+ def convert_computer_calls_xy2desc(
355
+ responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
356
+ ) -> List[Dict[str, Any]]:
304
357
  """
305
358
  Convert computer calls from x,y coordinates to element descriptions.
306
-
359
+
307
360
  Args:
308
361
  responses_items: List of response items containing computer calls with x,y coordinates
309
362
  desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
310
-
363
+
311
364
  Returns:
312
365
  List of response items with x,y coordinates replaced by element_description
313
366
  """
314
367
  # Create reverse mapping from coordinates to descriptions
315
368
  xy2desc = {coords: desc for desc, coords in desc2xy.items()}
316
-
369
+
317
370
  converted_items = []
318
-
371
+
319
372
  for item in responses_items:
320
373
  if item.get("type") == "computer_call" and "action" in item:
321
374
  action = item["action"].copy()
322
-
375
+
323
376
  # Handle single x,y coordinates
324
377
  if "x" in action and "y" in action:
325
378
  coords = (action["x"], action["y"])
@@ -327,77 +380,94 @@ def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2x
327
380
  action["element_description"] = xy2desc[coords]
328
381
  del action["x"]
329
382
  del action["y"]
330
-
383
+
331
384
  # Handle path for drag operations
332
385
  elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
333
386
  start_point = action["path"][0]
334
387
  end_point = action["path"][1]
335
-
336
- if ("x" in start_point and "y" in start_point and
337
- "x" in end_point and "y" in end_point):
338
-
388
+
389
+ if (
390
+ "x" in start_point
391
+ and "y" in start_point
392
+ and "x" in end_point
393
+ and "y" in end_point
394
+ ):
395
+
339
396
  start_coords = (start_point["x"], start_point["y"])
340
397
  end_coords = (end_point["x"], end_point["y"])
341
-
398
+
342
399
  if start_coords in xy2desc and end_coords in xy2desc:
343
400
  action["start_element_description"] = xy2desc[start_coords]
344
401
  action["end_element_description"] = xy2desc[end_coords]
345
402
  del action["path"]
346
-
403
+
347
404
  converted_item = item.copy()
348
405
  converted_item["action"] = action
349
406
  converted_items.append(converted_item)
350
407
  else:
351
408
  converted_items.append(item)
352
-
409
+
353
410
  return converted_items
354
411
 
355
412
 
356
413
  def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
357
414
  """
358
415
  Extract all element descriptions from computer calls in responses items.
359
-
416
+
360
417
  Args:
361
418
  responses_items: List of response items containing computer calls
362
-
419
+
363
420
  Returns:
364
421
  List of unique element descriptions found in computer calls
365
422
  """
366
423
  descriptions = set()
367
-
424
+
368
425
  for item in responses_items:
369
426
  if item.get("type") == "computer_call" and "action" in item:
370
427
  action = item["action"]
371
-
428
+
372
429
  # Handle single element_description
373
430
  if "element_description" in action:
374
431
  descriptions.add(action["element_description"])
375
-
432
+
376
433
  # Handle start_element_description and end_element_description for drag operations
377
434
  if "start_element_description" in action:
378
435
  descriptions.add(action["start_element_description"])
379
-
436
+
380
437
  if "end_element_description" in action:
381
438
  descriptions.add(action["end_element_description"])
382
-
439
+
383
440
  return list(descriptions)
384
441
 
385
442
 
386
443
  # Conversion functions between responses_items and completion messages formats
387
- def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]], allow_images_in_tool_results: bool = True) -> List[Dict[str, Any]]:
444
+ def convert_responses_items_to_completion_messages(
445
+ messages: List[Dict[str, Any]],
446
+ allow_images_in_tool_results: bool = True,
447
+ send_multiple_user_images_per_parallel_tool_results: bool = False,
448
+ use_xml_tools: bool = False,
449
+ ) -> List[Dict[str, Any]]:
388
450
  """Convert responses_items message format to liteLLM completion format.
389
-
451
+
390
452
  Args:
391
453
  messages: List of responses_items format messages
392
454
  allow_images_in_tool_results: If True, include images in tool role messages.
393
455
  If False, send tool message + separate user message with image.
456
+ send_multiple_user_images_per_parallel_tool_results: If True, send multiple user images in parallel tool results.
457
+ use_xml_tools: If True, use XML-style <tool_call> tags instead of tool_calls array.
458
+ Also sends tool results as user messages instead of tool role.
394
459
  """
460
+ # Assert that allow_images_in_tool_results is False when use_xml_tools is True
461
+ if use_xml_tools:
462
+ assert (
463
+ not allow_images_in_tool_results
464
+ ), "allow_images_in_tool_results must be False when use_xml_tools is True"
395
465
  completion_messages = []
396
-
397
- for message in messages:
466
+
467
+ for i, message in enumerate(messages):
398
468
  msg_type = message.get("type")
399
469
  role = message.get("role")
400
-
470
+
401
471
  # Handle user messages (both with and without explicit type)
402
472
  if role == "user" or msg_type == "user":
403
473
  content = message.get("content", "")
@@ -406,34 +476,19 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
406
476
  completion_content = []
407
477
  for item in content:
408
478
  if item.get("type") == "input_image":
409
- completion_content.append({
410
- "type": "image_url",
411
- "image_url": {
412
- "url": item.get("image_url")
413
- }
414
- })
479
+ completion_content.append(
480
+ {"type": "image_url", "image_url": {"url": item.get("image_url")}}
481
+ )
415
482
  elif item.get("type") == "input_text":
416
- completion_content.append({
417
- "type": "text",
418
- "text": item.get("text")
419
- })
483
+ completion_content.append({"type": "text", "text": item.get("text")})
420
484
  elif item.get("type") == "text":
421
- completion_content.append({
422
- "type": "text",
423
- "text": item.get("text")
424
- })
425
-
426
- completion_messages.append({
427
- "role": "user",
428
- "content": completion_content
429
- })
485
+ completion_content.append({"type": "text", "text": item.get("text")})
486
+
487
+ completion_messages.append({"role": "user", "content": completion_content})
430
488
  elif isinstance(content, str):
431
489
  # Handle string content
432
- completion_messages.append({
433
- "role": "user",
434
- "content": content
435
- })
436
-
490
+ completion_messages.append({"role": "user", "content": content})
491
+
437
492
  # Handle assistant messages
438
493
  elif role == "assistant" or msg_type == "message":
439
494
  content = message.get("content", [])
@@ -444,13 +499,12 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
444
499
  text_parts.append(item.get("text", ""))
445
500
  elif item.get("type") == "text":
446
501
  text_parts.append(item.get("text", ""))
447
-
502
+
448
503
  if text_parts:
449
- completion_messages.append({
450
- "role": "assistant",
451
- "content": "\n".join(text_parts)
452
- })
453
-
504
+ completion_messages.append(
505
+ {"role": "assistant", "content": "\n".join(text_parts)}
506
+ )
507
+
454
508
  # Handle reasoning items (convert to assistant message)
455
509
  elif msg_type == "reasoning":
456
510
  summary = message.get("summary", [])
@@ -458,107 +512,185 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
458
512
  for item in summary:
459
513
  if item.get("type") == "summary_text":
460
514
  text_parts.append(item.get("text", ""))
461
-
515
+
462
516
  if text_parts:
463
- completion_messages.append({
464
- "role": "assistant",
465
- "content": "\n".join(text_parts)
466
- })
467
-
517
+ completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})
518
+
468
519
  # Handle function calls
469
520
  elif msg_type == "function_call":
470
- # Add tool call to last assistant message or create new one
471
- if not completion_messages or completion_messages[-1]["role"] != "assistant":
472
- completion_messages.append({
473
- "role": "assistant",
474
- "content": "",
475
- "tool_calls": []
476
- })
477
-
478
- if "tool_calls" not in completion_messages[-1]:
479
- completion_messages[-1]["tool_calls"] = []
480
-
481
- completion_messages[-1]["tool_calls"].append({
482
- "id": message.get("call_id"),
483
- "type": "function",
484
- "function": {
485
- "name": message.get("name"),
486
- "arguments": message.get("arguments")
487
- }
488
- })
489
-
521
+ if use_xml_tools:
522
+ # Use XML format instead of tool_calls array
523
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
524
+ completion_messages.append({"role": "assistant", "content": ""})
525
+
526
+ # Ensure arguments is a JSON string (not a dict)
527
+ arguments = message.get("arguments")
528
+ if isinstance(arguments, dict):
529
+ arguments = json.dumps(arguments)
530
+
531
+ # Format as XML tool call
532
+ tool_call_xml = f'<tool_call>{{"name": "{message.get("name")}", "arguments": {arguments}}}</tool_call>'
533
+ if completion_messages[-1]["content"]:
534
+ completion_messages[-1]["content"] += "\n" + tool_call_xml
535
+ else:
536
+ completion_messages[-1]["content"] = tool_call_xml
537
+ else:
538
+ # Add tool call to last assistant message or create new one
539
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
540
+ completion_messages.append(
541
+ {"role": "assistant", "content": "", "tool_calls": []}
542
+ )
543
+
544
+ if "tool_calls" not in completion_messages[-1]:
545
+ completion_messages[-1]["tool_calls"] = []
546
+
547
+ # Ensure arguments is a JSON string (not a dict)
548
+ arguments = message.get("arguments")
549
+ if isinstance(arguments, dict):
550
+ arguments = json.dumps(arguments)
551
+
552
+ completion_messages[-1]["tool_calls"].append(
553
+ {
554
+ "id": message.get("call_id"),
555
+ "type": "function",
556
+ "function": {
557
+ "name": message.get("name"),
558
+ "arguments": arguments,
559
+ },
560
+ }
561
+ )
562
+
490
563
  # Handle computer calls
491
564
  elif msg_type == "computer_call":
492
- # Add tool call to last assistant message or create new one
493
- if not completion_messages or completion_messages[-1]["role"] != "assistant":
494
- completion_messages.append({
495
- "role": "assistant",
496
- "content": "",
497
- "tool_calls": []
498
- })
499
-
500
- if "tool_calls" not in completion_messages[-1]:
501
- completion_messages[-1]["tool_calls"] = []
502
-
503
- action = message.get("action", {})
504
- completion_messages[-1]["tool_calls"].append({
505
- "id": message.get("call_id"),
506
- "type": "function",
507
- "function": {
508
- "name": "computer",
509
- "arguments": json.dumps(action)
510
- }
511
- })
512
-
565
+ if use_xml_tools:
566
+ # Use XML format instead of tool_calls array
567
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
568
+ completion_messages.append({"role": "assistant", "content": ""})
569
+
570
+ action = message.get("action", {})
571
+ # Format as XML tool call
572
+ tool_call_xml = f'<tool_call>{{"name": "computer", "arguments": {json.dumps(action)}}}</tool_call>'
573
+ if completion_messages[-1]["content"]:
574
+ completion_messages[-1]["content"] += "\n" + tool_call_xml
575
+ else:
576
+ completion_messages[-1]["content"] = tool_call_xml
577
+ else:
578
+ # Add tool call to last assistant message or create new one
579
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
580
+ completion_messages.append(
581
+ {"role": "assistant", "content": "", "tool_calls": []}
582
+ )
583
+
584
+ if "tool_calls" not in completion_messages[-1]:
585
+ completion_messages[-1]["tool_calls"] = []
586
+
587
+ action = message.get("action", {})
588
+ completion_messages[-1]["tool_calls"].append(
589
+ {
590
+ "id": message.get("call_id"),
591
+ "type": "function",
592
+ "function": {"name": "computer", "arguments": json.dumps(action)},
593
+ }
594
+ )
595
+
513
596
  # Handle function/computer call outputs
514
597
  elif msg_type in ["function_call_output", "computer_call_output"]:
515
598
  output = message.get("output")
516
599
  call_id = message.get("call_id")
517
-
518
- if isinstance(output, dict) and output.get("type") == "input_image":
519
- if allow_images_in_tool_results:
520
- # Handle image output as tool response (may not work with all APIs)
521
- completion_messages.append({
522
- "role": "tool",
523
- "tool_call_id": call_id,
524
- "content": [{
525
- "type": "image_url",
526
- "image_url": {
527
- "url": output.get("image_url")
528
- }
529
- }]
530
- })
600
+
601
+ if use_xml_tools:
602
+ # When using XML tools, send all results as user messages
603
+ if isinstance(output, dict) and output.get("type") == "input_image":
604
+ # Send image as user message
605
+ completion_messages.append(
606
+ {
607
+ "role": "user",
608
+ "content": [
609
+ {
610
+ "type": "image_url",
611
+ "image_url": {"url": output.get("image_url")},
612
+ }
613
+ ],
614
+ }
615
+ )
531
616
  else:
532
- # Send tool message + separate user message with image (OpenAI compatible)
533
- completion_messages += [{
534
- "role": "tool",
535
- "tool_call_id": call_id,
536
- "content": "[Execution completed. See screenshot below]"
537
- }, {
538
- "role": "user",
539
- "content": [{
540
- "type": "image_url",
541
- "image_url": {
542
- "url": output.get("image_url")
543
- }
544
- }]
545
- }]
617
+ # Send text result as user message
618
+ completion_messages.append(
619
+ {
620
+ "role": "user",
621
+ "content": str(output),
622
+ }
623
+ )
546
624
  else:
547
- # Handle text output as tool response
548
- completion_messages.append({
549
- "role": "tool",
550
- "tool_call_id": call_id,
551
- "content": str(output)
552
- })
553
-
625
+ # Standard tool message handling
626
+ if isinstance(output, dict) and output.get("type") == "input_image":
627
+ if allow_images_in_tool_results:
628
+ # Handle image output as tool response (may not work with all APIs)
629
+ completion_messages.append(
630
+ {
631
+ "role": "tool",
632
+ "tool_call_id": call_id,
633
+ "content": [
634
+ {
635
+ "type": "image_url",
636
+ "image_url": {"url": output.get("image_url")},
637
+ }
638
+ ],
639
+ }
640
+ )
641
+ else:
642
+ # Determine if the next message is also a tool call output
643
+ next_type = None
644
+ if i + 1 < len(messages):
645
+ next_msg = messages[i + 1]
646
+ next_type = next_msg.get("type")
647
+ is_next_message_image_result = next_type in [
648
+ "computer_call_output",
649
+ ]
650
+ # Send tool message + separate user message with image (OpenAI compatible)
651
+ completion_messages += (
652
+ [
653
+ {
654
+ "role": "tool",
655
+ "tool_call_id": call_id,
656
+ "content": "[Execution completed. See screenshot below]",
657
+ },
658
+ {
659
+ "role": "user",
660
+ "content": [
661
+ {
662
+ "type": "image_url",
663
+ "image_url": {"url": output.get("image_url")},
664
+ }
665
+ ],
666
+ },
667
+ ]
668
+ if send_multiple_user_images_per_parallel_tool_results
669
+ or (not is_next_message_image_result)
670
+ else [
671
+ {
672
+ "role": "tool",
673
+ "tool_call_id": call_id,
674
+ "content": "[Execution completed. See screenshot below]",
675
+ },
676
+ ]
677
+ )
678
+ else:
679
+ # Handle text output as tool response
680
+ completion_messages.append(
681
+ {"role": "tool", "tool_call_id": call_id, "content": str(output)}
682
+ )
683
+
554
684
  return completion_messages
555
685
 
556
686
 
557
- def convert_completion_messages_to_responses_items(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
687
+ def convert_completion_messages_to_responses_items(
688
+ completion_messages: List[Dict[str, Any]],
689
+ ) -> List[Dict[str, Any]]:
558
690
  """Convert completion messages format to responses_items message format."""
559
691
  responses_items = []
560
692
  skip_next = False
561
-
693
+
562
694
  for i, message in enumerate(completion_messages):
563
695
  if skip_next:
564
696
  skip_next = False
@@ -567,25 +699,24 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
567
699
  role = message.get("role")
568
700
  content = message.get("content")
569
701
  tool_calls = message.get("tool_calls", [])
570
-
702
+
571
703
  # Handle assistant messages with text content
572
704
  if role == "assistant" and content and isinstance(content, str):
573
- responses_items.append({
574
- "type": "message",
575
- "role": "assistant",
576
- "content": [{
577
- "type": "output_text",
578
- "text": content
579
- }]
580
- })
581
-
705
+ responses_items.append(
706
+ {
707
+ "type": "message",
708
+ "role": "assistant",
709
+ "content": [{"type": "output_text", "text": content}],
710
+ }
711
+ )
712
+
582
713
  # Handle tool calls
583
714
  if tool_calls:
584
715
  for tool_call in tool_calls:
585
716
  if tool_call.get("type") == "function":
586
717
  function = tool_call.get("function", {})
587
718
  function_name = function.get("name")
588
-
719
+
589
720
  if function_name == "computer":
590
721
  # Parse computer action
591
722
  try:
@@ -594,31 +725,37 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
594
725
  if action.get("action"):
595
726
  action["type"] = action["action"]
596
727
  del action["action"]
597
- responses_items.append({
598
- "type": "computer_call",
599
- "call_id": tool_call.get("id"),
600
- "action": action,
601
- "status": "completed"
602
- })
728
+ responses_items.append(
729
+ {
730
+ "type": "computer_call",
731
+ "call_id": tool_call.get("id"),
732
+ "action": action,
733
+ "status": "completed",
734
+ }
735
+ )
603
736
  except json.JSONDecodeError:
604
737
  # Fallback to function call format
605
- responses_items.append({
738
+ responses_items.append(
739
+ {
740
+ "type": "function_call",
741
+ "call_id": tool_call.get("id"),
742
+ "name": function_name,
743
+ "arguments": function.get("arguments", "{}"),
744
+ "status": "completed",
745
+ }
746
+ )
747
+ else:
748
+ # Regular function call
749
+ responses_items.append(
750
+ {
606
751
  "type": "function_call",
607
752
  "call_id": tool_call.get("id"),
608
753
  "name": function_name,
609
754
  "arguments": function.get("arguments", "{}"),
610
- "status": "completed"
611
- })
612
- else:
613
- # Regular function call
614
- responses_items.append({
615
- "type": "function_call",
616
- "call_id": tool_call.get("id"),
617
- "name": function_name,
618
- "arguments": function.get("arguments", "{}"),
619
- "status": "completed"
620
- })
621
-
755
+ "status": "completed",
756
+ }
757
+ )
758
+
622
759
  # Handle tool messages (function/computer call outputs)
623
760
  elif role == "tool" and content:
624
761
  tool_call_id = message.get("tool_call_id")
@@ -627,74 +764,90 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
627
764
  if content == "[Execution completed. See screenshot below]":
628
765
  # Look ahead for the next user message with image
629
766
  next_idx = i + 1
630
- if (next_idx < len(completion_messages) and
631
- completion_messages[next_idx].get("role") == "user" and
632
- isinstance(completion_messages[next_idx].get("content"), list)):
767
+ if (
768
+ next_idx < len(completion_messages)
769
+ and completion_messages[next_idx].get("role") == "user"
770
+ and isinstance(completion_messages[next_idx].get("content"), list)
771
+ ):
633
772
  # Found the pattern - extract image from next message
634
773
  next_content = completion_messages[next_idx]["content"]
635
774
  for item in next_content:
636
775
  if item.get("type") == "image_url":
637
- responses_items.append({
638
- "type": "computer_call_output",
639
- "call_id": tool_call_id,
640
- "output": {
641
- "type": "input_image",
642
- "image_url": item.get("image_url", {}).get("url")
776
+ responses_items.append(
777
+ {
778
+ "type": "computer_call_output",
779
+ "call_id": tool_call_id,
780
+ "output": {
781
+ "type": "input_image",
782
+ "image_url": item.get("image_url", {}).get("url"),
783
+ },
643
784
  }
644
- })
785
+ )
645
786
  # Skip the next user message since we processed it
646
787
  skip_next = True
647
788
  break
648
789
  else:
649
790
  # No matching user message, treat as regular text
650
- responses_items.append({
651
- "type": "computer_call_output",
652
- "call_id": tool_call_id,
653
- "output": content
654
- })
791
+ responses_items.append(
792
+ {
793
+ "type": "computer_call_output",
794
+ "call_id": tool_call_id,
795
+ "output": content,
796
+ }
797
+ )
655
798
  else:
656
799
  # Determine if this is a computer call or function call output
657
800
  try:
658
801
  # Try to parse as structured output
659
802
  parsed_content = json.loads(content)
660
803
  if parsed_content.get("type") == "input_image":
661
- responses_items.append({
662
- "type": "computer_call_output",
663
- "call_id": tool_call_id,
664
- "output": parsed_content
665
- })
804
+ responses_items.append(
805
+ {
806
+ "type": "computer_call_output",
807
+ "call_id": tool_call_id,
808
+ "output": parsed_content,
809
+ }
810
+ )
666
811
  else:
667
- responses_items.append({
668
- "type": "computer_call_output",
669
- "call_id": tool_call_id,
670
- "output": content
671
- })
812
+ responses_items.append(
813
+ {
814
+ "type": "computer_call_output",
815
+ "call_id": tool_call_id,
816
+ "output": content,
817
+ }
818
+ )
672
819
  except json.JSONDecodeError:
673
820
  # Plain text output - could be function or computer call
674
- responses_items.append({
675
- "type": "function_call_output",
676
- "call_id": tool_call_id,
677
- "output": content
678
- })
821
+ responses_items.append(
822
+ {
823
+ "type": "function_call_output",
824
+ "call_id": tool_call_id,
825
+ "output": content,
826
+ }
827
+ )
679
828
  elif isinstance(content, list):
680
829
  # Handle structured content (e.g., images)
681
830
  for item in content:
682
831
  if item.get("type") == "image_url":
683
- responses_items.append({
684
- "type": "computer_call_output",
685
- "call_id": tool_call_id,
686
- "output": {
687
- "type": "input_image",
688
- "image_url": item.get("image_url", {}).get("url")
832
+ responses_items.append(
833
+ {
834
+ "type": "computer_call_output",
835
+ "call_id": tool_call_id,
836
+ "output": {
837
+ "type": "input_image",
838
+ "image_url": item.get("image_url", {}).get("url"),
839
+ },
689
840
  }
690
- })
841
+ )
691
842
  elif item.get("type") == "text":
692
- responses_items.append({
693
- "type": "function_call_output",
694
- "call_id": tool_call_id,
695
- "output": item.get("text")
696
- })
697
-
843
+ responses_items.append(
844
+ {
845
+ "type": "function_call_output",
846
+ "call_id": tool_call_id,
847
+ "output": item.get("text"),
848
+ }
849
+ )
850
+
698
851
  # Handle actual user messages
699
852
  elif role == "user" and content:
700
853
  if isinstance(content, list):
@@ -702,27 +855,21 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
702
855
  user_content = []
703
856
  for item in content:
704
857
  if item.get("type") == "image_url":
705
- user_content.append({
706
- "type": "input_image",
707
- "image_url": item.get("image_url", {}).get("url")
708
- })
858
+ user_content.append(
859
+ {
860
+ "type": "input_image",
861
+ "image_url": item.get("image_url", {}).get("url"),
862
+ }
863
+ )
709
864
  elif item.get("type") == "text":
710
- user_content.append({
711
- "type": "input_text",
712
- "text": item.get("text")
713
- })
714
-
865
+ user_content.append({"type": "input_text", "text": item.get("text")})
866
+
715
867
  if user_content:
716
- responses_items.append({
717
- "role": "user",
718
- "type": "message",
719
- "content": user_content
720
- })
868
+ responses_items.append(
869
+ {"role": "user", "type": "message", "content": user_content}
870
+ )
721
871
  elif isinstance(content, str):
722
872
  # Handle simple text user message
723
- responses_items.append({
724
- "role": "user",
725
- "content": content
726
- })
727
-
873
+ responses_items.append({"role": "user", "content": content})
874
+
728
875
  return responses_items