cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/responses.py CHANGED
@@ -6,10 +6,10 @@ Based on the OpenAI spec for Responses API items.
6
6
  import base64
7
7
  import json
8
8
  import uuid
9
- from typing import List, Dict, Any, Literal, Union, Optional
9
+ from typing import Any, Dict, List, Literal, Optional, Union
10
10
 
11
+ from openai.types.responses.easy_input_message_param import EasyInputMessageParam
11
12
  from openai.types.responses.response_computer_tool_call_param import (
12
- ResponseComputerToolCallParam,
13
13
  ActionClick,
14
14
  ActionDoubleClick,
15
15
  ActionDrag,
@@ -18,224 +18,222 @@ from openai.types.responses.response_computer_tool_call_param import (
18
18
  ActionMove,
19
19
  ActionScreenshot,
20
20
  ActionScroll,
21
+ )
22
+ from openai.types.responses.response_computer_tool_call_param import (
21
23
  ActionType as ActionTypeAction,
24
+ )
25
+ from openai.types.responses.response_computer_tool_call_param import (
22
26
  ActionWait,
23
- PendingSafetyCheck
27
+ PendingSafetyCheck,
28
+ ResponseComputerToolCallParam,
29
+ )
30
+ from openai.types.responses.response_function_tool_call_param import (
31
+ ResponseFunctionToolCallParam,
24
32
  )
25
-
26
- from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
27
- from openai.types.responses.response_output_text_param import ResponseOutputTextParam
28
- from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
29
- from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
30
- from openai.types.responses.easy_input_message_param import EasyInputMessageParam
31
33
  from openai.types.responses.response_input_image_param import ResponseInputImageParam
34
+ from openai.types.responses.response_output_message_param import (
35
+ ResponseOutputMessageParam,
36
+ )
37
+ from openai.types.responses.response_output_text_param import ResponseOutputTextParam
38
+ from openai.types.responses.response_reasoning_item_param import (
39
+ ResponseReasoningItemParam,
40
+ Summary,
41
+ )
42
+
32
43
 
33
44
  def random_id():
34
45
  return str(uuid.uuid4())
35
46
 
47
+
36
48
  # User message items
37
49
  def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
38
50
  return EasyInputMessageParam(
39
51
  content=[
40
52
  ResponseInputImageParam(
41
53
  type="input_image",
42
- image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
43
- ) # type: ignore
54
+ image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}",
55
+ ) # type: ignore
44
56
  ],
45
57
  role="user",
46
- type="message"
58
+ type="message",
47
59
  )
48
60
 
61
+
49
62
  # Text items
50
63
  def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
51
64
  return ResponseReasoningItemParam(
52
- id=random_id(),
53
- summary=[
54
- Summary(text=reasoning, type="summary_text")
55
- ],
56
- type="reasoning"
65
+ id=random_id(), summary=[Summary(text=reasoning, type="summary_text")], type="reasoning"
57
66
  )
58
67
 
68
+
59
69
  def make_output_text_item(content: str) -> ResponseOutputMessageParam:
60
70
  return ResponseOutputMessageParam(
61
71
  id=random_id(),
62
- content=[
63
- ResponseOutputTextParam(
64
- text=content,
65
- type="output_text",
66
- annotations=[]
67
- )
68
- ],
72
+ content=[ResponseOutputTextParam(text=content, type="output_text", annotations=[])],
69
73
  role="assistant",
70
74
  status="completed",
71
- type="message"
75
+ type="message",
72
76
  )
73
77
 
78
+
74
79
  # Function call items
75
- def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
80
+ def make_function_call_item(
81
+ function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
82
+ ) -> ResponseFunctionToolCallParam:
76
83
  return ResponseFunctionToolCallParam(
77
84
  id=random_id(),
78
85
  call_id=call_id if call_id else random_id(),
79
86
  name=function_name,
80
87
  arguments=json.dumps(arguments),
81
88
  status="completed",
82
- type="function_call"
89
+ type="function_call",
83
90
  )
84
91
 
92
+
85
93
  # Computer tool call items
86
- def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
94
+ def make_click_item(
95
+ x: int,
96
+ y: int,
97
+ button: Literal["left", "right", "wheel", "back", "forward"] = "left",
98
+ call_id: Optional[str] = None,
99
+ ) -> ResponseComputerToolCallParam:
87
100
  return ResponseComputerToolCallParam(
88
101
  id=random_id(),
89
102
  call_id=call_id if call_id else random_id(),
90
- action=ActionClick(
91
- button=button,
92
- type="click",
93
- x=x,
94
- y=y
95
- ),
103
+ action=ActionClick(button=button, type="click", x=x, y=y),
96
104
  pending_safety_checks=[],
97
105
  status="completed",
98
- type="computer_call"
106
+ type="computer_call",
99
107
  )
100
108
 
101
- def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
109
+
110
+ def make_double_click_item(
111
+ x: int, y: int, call_id: Optional[str] = None
112
+ ) -> ResponseComputerToolCallParam:
102
113
  return ResponseComputerToolCallParam(
103
114
  id=random_id(),
104
115
  call_id=call_id if call_id else random_id(),
105
- action=ActionDoubleClick(
106
- type="double_click",
107
- x=x,
108
- y=y
109
- ),
116
+ action=ActionDoubleClick(type="double_click", x=x, y=y),
110
117
  pending_safety_checks=[],
111
118
  status="completed",
112
- type="computer_call"
119
+ type="computer_call",
113
120
  )
114
121
 
115
- def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
122
+
123
+ def make_drag_item(
124
+ path: List[Dict[str, int]], call_id: Optional[str] = None
125
+ ) -> ResponseComputerToolCallParam:
116
126
  drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
117
127
  return ResponseComputerToolCallParam(
118
128
  id=random_id(),
119
129
  call_id=call_id if call_id else random_id(),
120
- action=ActionDrag(
121
- path=drag_path,
122
- type="drag"
123
- ),
130
+ action=ActionDrag(path=drag_path, type="drag"),
124
131
  pending_safety_checks=[],
125
132
  status="completed",
126
- type="computer_call"
133
+ type="computer_call",
127
134
  )
128
135
 
129
- def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
136
+
137
+ def make_keypress_item(
138
+ keys: List[str], call_id: Optional[str] = None
139
+ ) -> ResponseComputerToolCallParam:
130
140
  return ResponseComputerToolCallParam(
131
141
  id=random_id(),
132
142
  call_id=call_id if call_id else random_id(),
133
- action=ActionKeypress(
134
- keys=keys,
135
- type="keypress"
136
- ),
143
+ action=ActionKeypress(keys=keys, type="keypress"),
137
144
  pending_safety_checks=[],
138
145
  status="completed",
139
- type="computer_call"
146
+ type="computer_call",
140
147
  )
141
148
 
149
+
142
150
  def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
143
151
  return ResponseComputerToolCallParam(
144
152
  id=random_id(),
145
153
  call_id=call_id if call_id else random_id(),
146
- action=ActionMove(
147
- type="move",
148
- x=x,
149
- y=y
150
- ),
154
+ action=ActionMove(type="move", x=x, y=y),
151
155
  pending_safety_checks=[],
152
156
  status="completed",
153
- type="computer_call"
157
+ type="computer_call",
154
158
  )
155
159
 
160
+
156
161
  def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
157
162
  return ResponseComputerToolCallParam(
158
163
  id=random_id(),
159
164
  call_id=call_id if call_id else random_id(),
160
- action=ActionScreenshot(
161
- type="screenshot"
162
- ),
165
+ action=ActionScreenshot(type="screenshot"),
163
166
  pending_safety_checks=[],
164
167
  status="completed",
165
- type="computer_call"
168
+ type="computer_call",
166
169
  )
167
170
 
168
- def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
171
+
172
+ def make_scroll_item(
173
+ x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
174
+ ) -> ResponseComputerToolCallParam:
169
175
  return ResponseComputerToolCallParam(
170
176
  id=random_id(),
171
177
  call_id=call_id if call_id else random_id(),
172
- action=ActionScroll(
173
- scroll_x=scroll_x,
174
- scroll_y=scroll_y,
175
- type="scroll",
176
- x=x,
177
- y=y
178
- ),
178
+ action=ActionScroll(scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y),
179
179
  pending_safety_checks=[],
180
180
  status="completed",
181
- type="computer_call"
181
+ type="computer_call",
182
182
  )
183
183
 
184
+
184
185
  def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
185
186
  return ResponseComputerToolCallParam(
186
187
  id=random_id(),
187
188
  call_id=call_id if call_id else random_id(),
188
- action=ActionTypeAction(
189
- text=text,
190
- type="type"
191
- ),
189
+ action=ActionTypeAction(text=text, type="type"),
192
190
  pending_safety_checks=[],
193
191
  status="completed",
194
- type="computer_call"
192
+ type="computer_call",
195
193
  )
196
194
 
195
+
197
196
  def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
198
197
  return ResponseComputerToolCallParam(
199
198
  id=random_id(),
200
199
  call_id=call_id if call_id else random_id(),
201
- action=ActionWait(
202
- type="wait"
203
- ),
200
+ action=ActionWait(type="wait"),
204
201
  pending_safety_checks=[],
205
202
  status="completed",
206
- type="computer_call"
203
+ type="computer_call",
207
204
  )
208
205
 
206
+
209
207
  # Extra anthropic computer calls
210
- def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
208
+ def make_left_mouse_down_item(
209
+ x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
210
+ ) -> Dict[str, Any]:
211
211
  return {
212
212
  "id": random_id(),
213
213
  "call_id": call_id if call_id else random_id(),
214
- "action": {
215
- "type": "left_mouse_down",
216
- "x": x,
217
- "y": y
218
- },
214
+ "action": {"type": "left_mouse_down", "x": x, "y": y},
219
215
  "pending_safety_checks": [],
220
216
  "status": "completed",
221
- "type": "computer_call"
217
+ "type": "computer_call",
222
218
  }
223
219
 
224
- def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
220
+
221
+ def make_left_mouse_up_item(
222
+ x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
223
+ ) -> Dict[str, Any]:
225
224
  return {
226
225
  "id": random_id(),
227
226
  "call_id": call_id if call_id else random_id(),
228
- "action": {
229
- "type": "left_mouse_up",
230
- "x": x,
231
- "y": y
232
- },
227
+ "action": {"type": "left_mouse_up", "x": x, "y": y},
233
228
  "pending_safety_checks": [],
234
229
  "status": "completed",
235
- "type": "computer_call"
230
+ "type": "computer_call",
236
231
  }
237
232
 
238
- def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]:
233
+
234
+ def make_failed_tool_call_items(
235
+ tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
236
+ ) -> List[Dict[str, Any]]:
239
237
  call_id = call_id if call_id else random_id()
240
238
  return [
241
239
  {
@@ -249,9 +247,10 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
249
247
  "type": "function_call_output",
250
248
  "call_id": call_id,
251
249
  "output": json.dumps({"error": error_message}),
252
- }
250
+ },
253
251
  ]
254
252
 
253
+
255
254
  def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
256
255
  call_id = call_id if call_id else random_id()
257
256
  return {
@@ -260,12 +259,15 @@ def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> D
260
259
  "output": json.dumps({"error": error_message}),
261
260
  }
262
261
 
263
- def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
262
+
263
+ def replace_failed_computer_calls_with_function_calls(
264
+ messages: List[Dict[str, Any]],
265
+ ) -> List[Dict[str, Any]]:
264
266
  """
265
267
  Replace computer_call items with function_call items if they share a call_id with a function_call_output.
266
268
  This indicates the computer call failed and should be treated as a function call instead.
267
269
  We do this because the computer_call_output items do not support text output.
268
-
270
+
269
271
  Args:
270
272
  messages: List of message items to process
271
273
  """
@@ -278,16 +280,15 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
278
280
  call_id = msg.get("call_id")
279
281
  if call_id:
280
282
  failed_call_ids.add(call_id)
281
-
283
+
282
284
  # Replace computer_call items that have matching call_ids
283
285
  for i, msg in enumerate(messages):
284
- if (msg.get("type") == "computer_call" and
285
- msg.get("call_id") in failed_call_ids):
286
-
286
+ if msg.get("type") == "computer_call" and msg.get("call_id") in failed_call_ids:
287
+
287
288
  # Extract action from computer_call
288
289
  action = msg.get("action", {})
289
290
  call_id = msg.get("call_id")
290
-
291
+
291
292
  # Create function_call replacement
292
293
  messages[i] = {
293
294
  "type": "function_call",
@@ -296,27 +297,30 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
296
297
  "name": "computer",
297
298
  "arguments": json.dumps(action),
298
299
  }
299
-
300
+
300
301
  return messages
301
302
 
303
+
302
304
  # Conversion functions between element descriptions and coordinates
303
- def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
305
+ def convert_computer_calls_desc2xy(
306
+ responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
307
+ ) -> List[Dict[str, Any]]:
304
308
  """
305
309
  Convert computer calls from element descriptions to x,y coordinates.
306
-
310
+
307
311
  Args:
308
312
  responses_items: List of response items containing computer calls with element_description
309
313
  desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
310
-
314
+
311
315
  Returns:
312
316
  List of response items with element_description replaced by x,y coordinates
313
317
  """
314
318
  converted_items = []
315
-
319
+
316
320
  for item in responses_items:
317
321
  if item.get("type") == "computer_call" and "action" in item:
318
322
  action = item["action"].copy()
319
-
323
+
320
324
  # Handle single element_description
321
325
  if "element_description" in action:
322
326
  desc = action["element_description"]
@@ -325,48 +329,50 @@ def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2x
325
329
  action["x"] = x
326
330
  action["y"] = y
327
331
  del action["element_description"]
328
-
332
+
329
333
  # Handle start_element_description and end_element_description for drag operations
330
334
  elif "start_element_description" in action and "end_element_description" in action:
331
335
  start_desc = action["start_element_description"]
332
336
  end_desc = action["end_element_description"]
333
-
337
+
334
338
  if start_desc in desc2xy and end_desc in desc2xy:
335
339
  start_x, start_y = desc2xy[start_desc]
336
340
  end_x, end_y = desc2xy[end_desc]
337
341
  action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
338
342
  del action["start_element_description"]
339
343
  del action["end_element_description"]
340
-
344
+
341
345
  converted_item = item.copy()
342
346
  converted_item["action"] = action
343
347
  converted_items.append(converted_item)
344
348
  else:
345
349
  converted_items.append(item)
346
-
350
+
347
351
  return converted_items
348
352
 
349
353
 
350
- def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
354
+ def convert_computer_calls_xy2desc(
355
+ responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
356
+ ) -> List[Dict[str, Any]]:
351
357
  """
352
358
  Convert computer calls from x,y coordinates to element descriptions.
353
-
359
+
354
360
  Args:
355
361
  responses_items: List of response items containing computer calls with x,y coordinates
356
362
  desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
357
-
363
+
358
364
  Returns:
359
365
  List of response items with x,y coordinates replaced by element_description
360
366
  """
361
367
  # Create reverse mapping from coordinates to descriptions
362
368
  xy2desc = {coords: desc for desc, coords in desc2xy.items()}
363
-
369
+
364
370
  converted_items = []
365
-
371
+
366
372
  for item in responses_items:
367
373
  if item.get("type") == "computer_call" and "action" in item:
368
374
  action = item["action"].copy()
369
-
375
+
370
376
  # Handle single x,y coordinates
371
377
  if "x" in action and "y" in action:
372
378
  coords = (action["x"], action["y"])
@@ -374,77 +380,94 @@ def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2x
374
380
  action["element_description"] = xy2desc[coords]
375
381
  del action["x"]
376
382
  del action["y"]
377
-
383
+
378
384
  # Handle path for drag operations
379
385
  elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
380
386
  start_point = action["path"][0]
381
387
  end_point = action["path"][1]
382
-
383
- if ("x" in start_point and "y" in start_point and
384
- "x" in end_point and "y" in end_point):
385
-
388
+
389
+ if (
390
+ "x" in start_point
391
+ and "y" in start_point
392
+ and "x" in end_point
393
+ and "y" in end_point
394
+ ):
395
+
386
396
  start_coords = (start_point["x"], start_point["y"])
387
397
  end_coords = (end_point["x"], end_point["y"])
388
-
398
+
389
399
  if start_coords in xy2desc and end_coords in xy2desc:
390
400
  action["start_element_description"] = xy2desc[start_coords]
391
401
  action["end_element_description"] = xy2desc[end_coords]
392
402
  del action["path"]
393
-
403
+
394
404
  converted_item = item.copy()
395
405
  converted_item["action"] = action
396
406
  converted_items.append(converted_item)
397
407
  else:
398
408
  converted_items.append(item)
399
-
409
+
400
410
  return converted_items
401
411
 
402
412
 
403
413
  def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
404
414
  """
405
415
  Extract all element descriptions from computer calls in responses items.
406
-
416
+
407
417
  Args:
408
418
  responses_items: List of response items containing computer calls
409
-
419
+
410
420
  Returns:
411
421
  List of unique element descriptions found in computer calls
412
422
  """
413
423
  descriptions = set()
414
-
424
+
415
425
  for item in responses_items:
416
426
  if item.get("type") == "computer_call" and "action" in item:
417
427
  action = item["action"]
418
-
428
+
419
429
  # Handle single element_description
420
430
  if "element_description" in action:
421
431
  descriptions.add(action["element_description"])
422
-
432
+
423
433
  # Handle start_element_description and end_element_description for drag operations
424
434
  if "start_element_description" in action:
425
435
  descriptions.add(action["start_element_description"])
426
-
436
+
427
437
  if "end_element_description" in action:
428
438
  descriptions.add(action["end_element_description"])
429
-
439
+
430
440
  return list(descriptions)
431
441
 
432
442
 
433
443
  # Conversion functions between responses_items and completion messages formats
434
- def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]], allow_images_in_tool_results: bool = True) -> List[Dict[str, Any]]:
444
+ def convert_responses_items_to_completion_messages(
445
+ messages: List[Dict[str, Any]],
446
+ allow_images_in_tool_results: bool = True,
447
+ send_multiple_user_images_per_parallel_tool_results: bool = False,
448
+ use_xml_tools: bool = False,
449
+ ) -> List[Dict[str, Any]]:
435
450
  """Convert responses_items message format to liteLLM completion format.
436
-
451
+
437
452
  Args:
438
453
  messages: List of responses_items format messages
439
454
  allow_images_in_tool_results: If True, include images in tool role messages.
440
455
  If False, send tool message + separate user message with image.
456
+ send_multiple_user_images_per_parallel_tool_results: If True, send multiple user images in parallel tool results.
457
+ use_xml_tools: If True, use XML-style <tool_call> tags instead of tool_calls array.
458
+ Also sends tool results as user messages instead of tool role.
441
459
  """
460
+ # Assert that allow_images_in_tool_results is False when use_xml_tools is True
461
+ if use_xml_tools:
462
+ assert (
463
+ not allow_images_in_tool_results
464
+ ), "allow_images_in_tool_results must be False when use_xml_tools is True"
442
465
  completion_messages = []
443
-
444
- for message in messages:
466
+
467
+ for i, message in enumerate(messages):
445
468
  msg_type = message.get("type")
446
469
  role = message.get("role")
447
-
470
+
448
471
  # Handle user messages (both with and without explicit type)
449
472
  if role == "user" or msg_type == "user":
450
473
  content = message.get("content", "")
@@ -453,34 +476,19 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
453
476
  completion_content = []
454
477
  for item in content:
455
478
  if item.get("type") == "input_image":
456
- completion_content.append({
457
- "type": "image_url",
458
- "image_url": {
459
- "url": item.get("image_url")
460
- }
461
- })
479
+ completion_content.append(
480
+ {"type": "image_url", "image_url": {"url": item.get("image_url")}}
481
+ )
462
482
  elif item.get("type") == "input_text":
463
- completion_content.append({
464
- "type": "text",
465
- "text": item.get("text")
466
- })
483
+ completion_content.append({"type": "text", "text": item.get("text")})
467
484
  elif item.get("type") == "text":
468
- completion_content.append({
469
- "type": "text",
470
- "text": item.get("text")
471
- })
472
-
473
- completion_messages.append({
474
- "role": "user",
475
- "content": completion_content
476
- })
485
+ completion_content.append({"type": "text", "text": item.get("text")})
486
+
487
+ completion_messages.append({"role": "user", "content": completion_content})
477
488
  elif isinstance(content, str):
478
489
  # Handle string content
479
- completion_messages.append({
480
- "role": "user",
481
- "content": content
482
- })
483
-
490
+ completion_messages.append({"role": "user", "content": content})
491
+
484
492
  # Handle assistant messages
485
493
  elif role == "assistant" or msg_type == "message":
486
494
  content = message.get("content", [])
@@ -491,13 +499,12 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
491
499
  text_parts.append(item.get("text", ""))
492
500
  elif item.get("type") == "text":
493
501
  text_parts.append(item.get("text", ""))
494
-
502
+
495
503
  if text_parts:
496
- completion_messages.append({
497
- "role": "assistant",
498
- "content": "\n".join(text_parts)
499
- })
500
-
504
+ completion_messages.append(
505
+ {"role": "assistant", "content": "\n".join(text_parts)}
506
+ )
507
+
501
508
  # Handle reasoning items (convert to assistant message)
502
509
  elif msg_type == "reasoning":
503
510
  summary = message.get("summary", [])
@@ -505,107 +512,185 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
505
512
  for item in summary:
506
513
  if item.get("type") == "summary_text":
507
514
  text_parts.append(item.get("text", ""))
508
-
515
+
509
516
  if text_parts:
510
- completion_messages.append({
511
- "role": "assistant",
512
- "content": "\n".join(text_parts)
513
- })
514
-
517
+ completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})
518
+
515
519
  # Handle function calls
516
520
  elif msg_type == "function_call":
517
- # Add tool call to last assistant message or create new one
518
- if not completion_messages or completion_messages[-1]["role"] != "assistant":
519
- completion_messages.append({
520
- "role": "assistant",
521
- "content": "",
522
- "tool_calls": []
523
- })
524
-
525
- if "tool_calls" not in completion_messages[-1]:
526
- completion_messages[-1]["tool_calls"] = []
527
-
528
- completion_messages[-1]["tool_calls"].append({
529
- "id": message.get("call_id"),
530
- "type": "function",
531
- "function": {
532
- "name": message.get("name"),
533
- "arguments": message.get("arguments")
534
- }
535
- })
536
-
521
+ if use_xml_tools:
522
+ # Use XML format instead of tool_calls array
523
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
524
+ completion_messages.append({"role": "assistant", "content": ""})
525
+
526
+ # Ensure arguments is a JSON string (not a dict)
527
+ arguments = message.get("arguments")
528
+ if isinstance(arguments, dict):
529
+ arguments = json.dumps(arguments)
530
+
531
+ # Format as XML tool call
532
+ tool_call_xml = f'<tool_call>{{"name": "{message.get("name")}", "arguments": {arguments}}}</tool_call>'
533
+ if completion_messages[-1]["content"]:
534
+ completion_messages[-1]["content"] += "\n" + tool_call_xml
535
+ else:
536
+ completion_messages[-1]["content"] = tool_call_xml
537
+ else:
538
+ # Add tool call to last assistant message or create new one
539
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
540
+ completion_messages.append(
541
+ {"role": "assistant", "content": "", "tool_calls": []}
542
+ )
543
+
544
+ if "tool_calls" not in completion_messages[-1]:
545
+ completion_messages[-1]["tool_calls"] = []
546
+
547
+ # Ensure arguments is a JSON string (not a dict)
548
+ arguments = message.get("arguments")
549
+ if isinstance(arguments, dict):
550
+ arguments = json.dumps(arguments)
551
+
552
+ completion_messages[-1]["tool_calls"].append(
553
+ {
554
+ "id": message.get("call_id"),
555
+ "type": "function",
556
+ "function": {
557
+ "name": message.get("name"),
558
+ "arguments": arguments,
559
+ },
560
+ }
561
+ )
562
+
537
563
  # Handle computer calls
538
564
  elif msg_type == "computer_call":
539
- # Add tool call to last assistant message or create new one
540
- if not completion_messages or completion_messages[-1]["role"] != "assistant":
541
- completion_messages.append({
542
- "role": "assistant",
543
- "content": "",
544
- "tool_calls": []
545
- })
546
-
547
- if "tool_calls" not in completion_messages[-1]:
548
- completion_messages[-1]["tool_calls"] = []
549
-
550
- action = message.get("action", {})
551
- completion_messages[-1]["tool_calls"].append({
552
- "id": message.get("call_id"),
553
- "type": "function",
554
- "function": {
555
- "name": "computer",
556
- "arguments": json.dumps(action)
557
- }
558
- })
559
-
565
+ if use_xml_tools:
566
+ # Use XML format instead of tool_calls array
567
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
568
+ completion_messages.append({"role": "assistant", "content": ""})
569
+
570
+ action = message.get("action", {})
571
+ # Format as XML tool call
572
+ tool_call_xml = f'<tool_call>{{"name": "computer", "arguments": {json.dumps(action)}}}</tool_call>'
573
+ if completion_messages[-1]["content"]:
574
+ completion_messages[-1]["content"] += "\n" + tool_call_xml
575
+ else:
576
+ completion_messages[-1]["content"] = tool_call_xml
577
+ else:
578
+ # Add tool call to last assistant message or create new one
579
+ if not completion_messages or completion_messages[-1]["role"] != "assistant":
580
+ completion_messages.append(
581
+ {"role": "assistant", "content": "", "tool_calls": []}
582
+ )
583
+
584
+ if "tool_calls" not in completion_messages[-1]:
585
+ completion_messages[-1]["tool_calls"] = []
586
+
587
+ action = message.get("action", {})
588
+ completion_messages[-1]["tool_calls"].append(
589
+ {
590
+ "id": message.get("call_id"),
591
+ "type": "function",
592
+ "function": {"name": "computer", "arguments": json.dumps(action)},
593
+ }
594
+ )
595
+
560
596
  # Handle function/computer call outputs
561
597
  elif msg_type in ["function_call_output", "computer_call_output"]:
562
598
  output = message.get("output")
563
599
  call_id = message.get("call_id")
564
-
565
- if isinstance(output, dict) and output.get("type") == "input_image":
566
- if allow_images_in_tool_results:
567
- # Handle image output as tool response (may not work with all APIs)
568
- completion_messages.append({
569
- "role": "tool",
570
- "tool_call_id": call_id,
571
- "content": [{
572
- "type": "image_url",
573
- "image_url": {
574
- "url": output.get("image_url")
575
- }
576
- }]
577
- })
600
+
601
+ if use_xml_tools:
602
+ # When using XML tools, send all results as user messages
603
+ if isinstance(output, dict) and output.get("type") == "input_image":
604
+ # Send image as user message
605
+ completion_messages.append(
606
+ {
607
+ "role": "user",
608
+ "content": [
609
+ {
610
+ "type": "image_url",
611
+ "image_url": {"url": output.get("image_url")},
612
+ }
613
+ ],
614
+ }
615
+ )
578
616
  else:
579
- # Send tool message + separate user message with image (OpenAI compatible)
580
- completion_messages += [{
581
- "role": "tool",
582
- "tool_call_id": call_id,
583
- "content": "[Execution completed. See screenshot below]"
584
- }, {
585
- "role": "user",
586
- "content": [{
587
- "type": "image_url",
588
- "image_url": {
589
- "url": output.get("image_url")
590
- }
591
- }]
592
- }]
617
+ # Send text result as user message
618
+ completion_messages.append(
619
+ {
620
+ "role": "user",
621
+ "content": str(output),
622
+ }
623
+ )
593
624
  else:
594
- # Handle text output as tool response
595
- completion_messages.append({
596
- "role": "tool",
597
- "tool_call_id": call_id,
598
- "content": str(output)
599
- })
600
-
625
+ # Standard tool message handling
626
+ if isinstance(output, dict) and output.get("type") == "input_image":
627
+ if allow_images_in_tool_results:
628
+ # Handle image output as tool response (may not work with all APIs)
629
+ completion_messages.append(
630
+ {
631
+ "role": "tool",
632
+ "tool_call_id": call_id,
633
+ "content": [
634
+ {
635
+ "type": "image_url",
636
+ "image_url": {"url": output.get("image_url")},
637
+ }
638
+ ],
639
+ }
640
+ )
641
+ else:
642
+ # Determine if the next message is also a tool call output
643
+ next_type = None
644
+ if i + 1 < len(messages):
645
+ next_msg = messages[i + 1]
646
+ next_type = next_msg.get("type")
647
+ is_next_message_image_result = next_type in [
648
+ "computer_call_output",
649
+ ]
650
+ # Send tool message + separate user message with image (OpenAI compatible)
651
+ completion_messages += (
652
+ [
653
+ {
654
+ "role": "tool",
655
+ "tool_call_id": call_id,
656
+ "content": "[Execution completed. See screenshot below]",
657
+ },
658
+ {
659
+ "role": "user",
660
+ "content": [
661
+ {
662
+ "type": "image_url",
663
+ "image_url": {"url": output.get("image_url")},
664
+ }
665
+ ],
666
+ },
667
+ ]
668
+ if send_multiple_user_images_per_parallel_tool_results
669
+ or (not is_next_message_image_result)
670
+ else [
671
+ {
672
+ "role": "tool",
673
+ "tool_call_id": call_id,
674
+ "content": "[Execution completed. See screenshot below]",
675
+ },
676
+ ]
677
+ )
678
+ else:
679
+ # Handle text output as tool response
680
+ completion_messages.append(
681
+ {"role": "tool", "tool_call_id": call_id, "content": str(output)}
682
+ )
683
+
601
684
  return completion_messages
602
685
 
603
686
 
604
- def convert_completion_messages_to_responses_items(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
687
+ def convert_completion_messages_to_responses_items(
688
+ completion_messages: List[Dict[str, Any]],
689
+ ) -> List[Dict[str, Any]]:
605
690
  """Convert completion messages format to responses_items message format."""
606
691
  responses_items = []
607
692
  skip_next = False
608
-
693
+
609
694
  for i, message in enumerate(completion_messages):
610
695
  if skip_next:
611
696
  skip_next = False
@@ -614,25 +699,24 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
614
699
  role = message.get("role")
615
700
  content = message.get("content")
616
701
  tool_calls = message.get("tool_calls", [])
617
-
702
+
618
703
  # Handle assistant messages with text content
619
704
  if role == "assistant" and content and isinstance(content, str):
620
- responses_items.append({
621
- "type": "message",
622
- "role": "assistant",
623
- "content": [{
624
- "type": "output_text",
625
- "text": content
626
- }]
627
- })
628
-
705
+ responses_items.append(
706
+ {
707
+ "type": "message",
708
+ "role": "assistant",
709
+ "content": [{"type": "output_text", "text": content}],
710
+ }
711
+ )
712
+
629
713
  # Handle tool calls
630
714
  if tool_calls:
631
715
  for tool_call in tool_calls:
632
716
  if tool_call.get("type") == "function":
633
717
  function = tool_call.get("function", {})
634
718
  function_name = function.get("name")
635
-
719
+
636
720
  if function_name == "computer":
637
721
  # Parse computer action
638
722
  try:
@@ -641,31 +725,37 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
641
725
  if action.get("action"):
642
726
  action["type"] = action["action"]
643
727
  del action["action"]
644
- responses_items.append({
645
- "type": "computer_call",
646
- "call_id": tool_call.get("id"),
647
- "action": action,
648
- "status": "completed"
649
- })
728
+ responses_items.append(
729
+ {
730
+ "type": "computer_call",
731
+ "call_id": tool_call.get("id"),
732
+ "action": action,
733
+ "status": "completed",
734
+ }
735
+ )
650
736
  except json.JSONDecodeError:
651
737
  # Fallback to function call format
652
- responses_items.append({
738
+ responses_items.append(
739
+ {
740
+ "type": "function_call",
741
+ "call_id": tool_call.get("id"),
742
+ "name": function_name,
743
+ "arguments": function.get("arguments", "{}"),
744
+ "status": "completed",
745
+ }
746
+ )
747
+ else:
748
+ # Regular function call
749
+ responses_items.append(
750
+ {
653
751
  "type": "function_call",
654
752
  "call_id": tool_call.get("id"),
655
753
  "name": function_name,
656
754
  "arguments": function.get("arguments", "{}"),
657
- "status": "completed"
658
- })
659
- else:
660
- # Regular function call
661
- responses_items.append({
662
- "type": "function_call",
663
- "call_id": tool_call.get("id"),
664
- "name": function_name,
665
- "arguments": function.get("arguments", "{}"),
666
- "status": "completed"
667
- })
668
-
755
+ "status": "completed",
756
+ }
757
+ )
758
+
669
759
  # Handle tool messages (function/computer call outputs)
670
760
  elif role == "tool" and content:
671
761
  tool_call_id = message.get("tool_call_id")
@@ -674,74 +764,90 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
674
764
  if content == "[Execution completed. See screenshot below]":
675
765
  # Look ahead for the next user message with image
676
766
  next_idx = i + 1
677
- if (next_idx < len(completion_messages) and
678
- completion_messages[next_idx].get("role") == "user" and
679
- isinstance(completion_messages[next_idx].get("content"), list)):
767
+ if (
768
+ next_idx < len(completion_messages)
769
+ and completion_messages[next_idx].get("role") == "user"
770
+ and isinstance(completion_messages[next_idx].get("content"), list)
771
+ ):
680
772
  # Found the pattern - extract image from next message
681
773
  next_content = completion_messages[next_idx]["content"]
682
774
  for item in next_content:
683
775
  if item.get("type") == "image_url":
684
- responses_items.append({
685
- "type": "computer_call_output",
686
- "call_id": tool_call_id,
687
- "output": {
688
- "type": "input_image",
689
- "image_url": item.get("image_url", {}).get("url")
776
+ responses_items.append(
777
+ {
778
+ "type": "computer_call_output",
779
+ "call_id": tool_call_id,
780
+ "output": {
781
+ "type": "input_image",
782
+ "image_url": item.get("image_url", {}).get("url"),
783
+ },
690
784
  }
691
- })
785
+ )
692
786
  # Skip the next user message since we processed it
693
787
  skip_next = True
694
788
  break
695
789
  else:
696
790
  # No matching user message, treat as regular text
697
- responses_items.append({
698
- "type": "computer_call_output",
699
- "call_id": tool_call_id,
700
- "output": content
701
- })
791
+ responses_items.append(
792
+ {
793
+ "type": "computer_call_output",
794
+ "call_id": tool_call_id,
795
+ "output": content,
796
+ }
797
+ )
702
798
  else:
703
799
  # Determine if this is a computer call or function call output
704
800
  try:
705
801
  # Try to parse as structured output
706
802
  parsed_content = json.loads(content)
707
803
  if parsed_content.get("type") == "input_image":
708
- responses_items.append({
709
- "type": "computer_call_output",
710
- "call_id": tool_call_id,
711
- "output": parsed_content
712
- })
804
+ responses_items.append(
805
+ {
806
+ "type": "computer_call_output",
807
+ "call_id": tool_call_id,
808
+ "output": parsed_content,
809
+ }
810
+ )
713
811
  else:
714
- responses_items.append({
715
- "type": "computer_call_output",
716
- "call_id": tool_call_id,
717
- "output": content
718
- })
812
+ responses_items.append(
813
+ {
814
+ "type": "computer_call_output",
815
+ "call_id": tool_call_id,
816
+ "output": content,
817
+ }
818
+ )
719
819
  except json.JSONDecodeError:
720
820
  # Plain text output - could be function or computer call
721
- responses_items.append({
722
- "type": "function_call_output",
723
- "call_id": tool_call_id,
724
- "output": content
725
- })
821
+ responses_items.append(
822
+ {
823
+ "type": "function_call_output",
824
+ "call_id": tool_call_id,
825
+ "output": content,
826
+ }
827
+ )
726
828
  elif isinstance(content, list):
727
829
  # Handle structured content (e.g., images)
728
830
  for item in content:
729
831
  if item.get("type") == "image_url":
730
- responses_items.append({
731
- "type": "computer_call_output",
732
- "call_id": tool_call_id,
733
- "output": {
734
- "type": "input_image",
735
- "image_url": item.get("image_url", {}).get("url")
832
+ responses_items.append(
833
+ {
834
+ "type": "computer_call_output",
835
+ "call_id": tool_call_id,
836
+ "output": {
837
+ "type": "input_image",
838
+ "image_url": item.get("image_url", {}).get("url"),
839
+ },
736
840
  }
737
- })
841
+ )
738
842
  elif item.get("type") == "text":
739
- responses_items.append({
740
- "type": "function_call_output",
741
- "call_id": tool_call_id,
742
- "output": item.get("text")
743
- })
744
-
843
+ responses_items.append(
844
+ {
845
+ "type": "function_call_output",
846
+ "call_id": tool_call_id,
847
+ "output": item.get("text"),
848
+ }
849
+ )
850
+
745
851
  # Handle actual user messages
746
852
  elif role == "user" and content:
747
853
  if isinstance(content, list):
@@ -749,27 +855,21 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
749
855
  user_content = []
750
856
  for item in content:
751
857
  if item.get("type") == "image_url":
752
- user_content.append({
753
- "type": "input_image",
754
- "image_url": item.get("image_url", {}).get("url")
755
- })
858
+ user_content.append(
859
+ {
860
+ "type": "input_image",
861
+ "image_url": item.get("image_url", {}).get("url"),
862
+ }
863
+ )
756
864
  elif item.get("type") == "text":
757
- user_content.append({
758
- "type": "input_text",
759
- "text": item.get("text")
760
- })
761
-
865
+ user_content.append({"type": "input_text", "text": item.get("text")})
866
+
762
867
  if user_content:
763
- responses_items.append({
764
- "role": "user",
765
- "type": "message",
766
- "content": user_content
767
- })
868
+ responses_items.append(
869
+ {"role": "user", "type": "message", "content": user_content}
870
+ )
768
871
  elif isinstance(content, str):
769
872
  # Handle simple text user message
770
- responses_items.append({
771
- "role": "user",
772
- "content": content
773
- })
774
-
873
+ responses_items.append({"role": "user", "content": content})
874
+
775
875
  return responses_items