cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/responses.py CHANGED
@@ -6,10 +6,10 @@ Based on the OpenAI spec for Responses API items.
6
6
  import base64
7
7
  import json
8
8
  import uuid
9
- from typing import List, Dict, Any, Literal, Union, Optional
9
+ from typing import Any, Dict, List, Literal, Optional, Union
10
10
 
11
+ from openai.types.responses.easy_input_message_param import EasyInputMessageParam
11
12
  from openai.types.responses.response_computer_tool_call_param import (
12
- ResponseComputerToolCallParam,
13
13
  ActionClick,
14
14
  ActionDoubleClick,
15
15
  ActionDrag,
@@ -18,224 +18,222 @@ from openai.types.responses.response_computer_tool_call_param import (
18
18
  ActionMove,
19
19
  ActionScreenshot,
20
20
  ActionScroll,
21
+ )
22
+ from openai.types.responses.response_computer_tool_call_param import (
21
23
  ActionType as ActionTypeAction,
24
+ )
25
+ from openai.types.responses.response_computer_tool_call_param import (
22
26
  ActionWait,
23
- PendingSafetyCheck
27
+ PendingSafetyCheck,
28
+ ResponseComputerToolCallParam,
29
+ )
30
+ from openai.types.responses.response_function_tool_call_param import (
31
+ ResponseFunctionToolCallParam,
24
32
  )
25
-
26
- from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
27
- from openai.types.responses.response_output_text_param import ResponseOutputTextParam
28
- from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
29
- from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
30
- from openai.types.responses.easy_input_message_param import EasyInputMessageParam
31
33
  from openai.types.responses.response_input_image_param import ResponseInputImageParam
34
+ from openai.types.responses.response_output_message_param import (
35
+ ResponseOutputMessageParam,
36
+ )
37
+ from openai.types.responses.response_output_text_param import ResponseOutputTextParam
38
+ from openai.types.responses.response_reasoning_item_param import (
39
+ ResponseReasoningItemParam,
40
+ Summary,
41
+ )
42
+
32
43
 
33
44
  def random_id():
34
45
  return str(uuid.uuid4())
35
46
 
47
+
36
48
  # User message items
37
49
  def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
38
50
  return EasyInputMessageParam(
39
51
  content=[
40
52
  ResponseInputImageParam(
41
53
  type="input_image",
42
- image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
43
- ) # type: ignore
54
+ image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}",
55
+ ) # type: ignore
44
56
  ],
45
57
  role="user",
46
- type="message"
58
+ type="message",
47
59
  )
48
60
 
61
+
49
62
  # Text items
50
63
  def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
51
64
  return ResponseReasoningItemParam(
52
- id=random_id(),
53
- summary=[
54
- Summary(text=reasoning, type="summary_text")
55
- ],
56
- type="reasoning"
65
+ id=random_id(), summary=[Summary(text=reasoning, type="summary_text")], type="reasoning"
57
66
  )
58
67
 
68
+
59
69
  def make_output_text_item(content: str) -> ResponseOutputMessageParam:
60
70
  return ResponseOutputMessageParam(
61
71
  id=random_id(),
62
- content=[
63
- ResponseOutputTextParam(
64
- text=content,
65
- type="output_text",
66
- annotations=[]
67
- )
68
- ],
72
+ content=[ResponseOutputTextParam(text=content, type="output_text", annotations=[])],
69
73
  role="assistant",
70
74
  status="completed",
71
- type="message"
75
+ type="message",
72
76
  )
73
77
 
78
+
74
79
  # Function call items
75
- def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
80
+ def make_function_call_item(
81
+ function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None
82
+ ) -> ResponseFunctionToolCallParam:
76
83
  return ResponseFunctionToolCallParam(
77
84
  id=random_id(),
78
85
  call_id=call_id if call_id else random_id(),
79
86
  name=function_name,
80
87
  arguments=json.dumps(arguments),
81
88
  status="completed",
82
- type="function_call"
89
+ type="function_call",
83
90
  )
84
91
 
92
+
85
93
  # Computer tool call items
86
- def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
94
+ def make_click_item(
95
+ x: int,
96
+ y: int,
97
+ button: Literal["left", "right", "wheel", "back", "forward"] = "left",
98
+ call_id: Optional[str] = None,
99
+ ) -> ResponseComputerToolCallParam:
87
100
  return ResponseComputerToolCallParam(
88
101
  id=random_id(),
89
102
  call_id=call_id if call_id else random_id(),
90
- action=ActionClick(
91
- button=button,
92
- type="click",
93
- x=x,
94
- y=y
95
- ),
103
+ action=ActionClick(button=button, type="click", x=x, y=y),
96
104
  pending_safety_checks=[],
97
105
  status="completed",
98
- type="computer_call"
106
+ type="computer_call",
99
107
  )
100
108
 
101
- def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
109
+
110
+ def make_double_click_item(
111
+ x: int, y: int, call_id: Optional[str] = None
112
+ ) -> ResponseComputerToolCallParam:
102
113
  return ResponseComputerToolCallParam(
103
114
  id=random_id(),
104
115
  call_id=call_id if call_id else random_id(),
105
- action=ActionDoubleClick(
106
- type="double_click",
107
- x=x,
108
- y=y
109
- ),
116
+ action=ActionDoubleClick(type="double_click", x=x, y=y),
110
117
  pending_safety_checks=[],
111
118
  status="completed",
112
- type="computer_call"
119
+ type="computer_call",
113
120
  )
114
121
 
115
- def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
122
+
123
+ def make_drag_item(
124
+ path: List[Dict[str, int]], call_id: Optional[str] = None
125
+ ) -> ResponseComputerToolCallParam:
116
126
  drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
117
127
  return ResponseComputerToolCallParam(
118
128
  id=random_id(),
119
129
  call_id=call_id if call_id else random_id(),
120
- action=ActionDrag(
121
- path=drag_path,
122
- type="drag"
123
- ),
130
+ action=ActionDrag(path=drag_path, type="drag"),
124
131
  pending_safety_checks=[],
125
132
  status="completed",
126
- type="computer_call"
133
+ type="computer_call",
127
134
  )
128
135
 
129
- def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
136
+
137
+ def make_keypress_item(
138
+ keys: List[str], call_id: Optional[str] = None
139
+ ) -> ResponseComputerToolCallParam:
130
140
  return ResponseComputerToolCallParam(
131
141
  id=random_id(),
132
142
  call_id=call_id if call_id else random_id(),
133
- action=ActionKeypress(
134
- keys=keys,
135
- type="keypress"
136
- ),
143
+ action=ActionKeypress(keys=keys, type="keypress"),
137
144
  pending_safety_checks=[],
138
145
  status="completed",
139
- type="computer_call"
146
+ type="computer_call",
140
147
  )
141
148
 
149
+
142
150
  def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
143
151
  return ResponseComputerToolCallParam(
144
152
  id=random_id(),
145
153
  call_id=call_id if call_id else random_id(),
146
- action=ActionMove(
147
- type="move",
148
- x=x,
149
- y=y
150
- ),
154
+ action=ActionMove(type="move", x=x, y=y),
151
155
  pending_safety_checks=[],
152
156
  status="completed",
153
- type="computer_call"
157
+ type="computer_call",
154
158
  )
155
159
 
160
+
156
161
  def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
157
162
  return ResponseComputerToolCallParam(
158
163
  id=random_id(),
159
164
  call_id=call_id if call_id else random_id(),
160
- action=ActionScreenshot(
161
- type="screenshot"
162
- ),
165
+ action=ActionScreenshot(type="screenshot"),
163
166
  pending_safety_checks=[],
164
167
  status="completed",
165
- type="computer_call"
168
+ type="computer_call",
166
169
  )
167
170
 
168
- def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
171
+
172
+ def make_scroll_item(
173
+ x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None
174
+ ) -> ResponseComputerToolCallParam:
169
175
  return ResponseComputerToolCallParam(
170
176
  id=random_id(),
171
177
  call_id=call_id if call_id else random_id(),
172
- action=ActionScroll(
173
- scroll_x=scroll_x,
174
- scroll_y=scroll_y,
175
- type="scroll",
176
- x=x,
177
- y=y
178
- ),
178
+ action=ActionScroll(scroll_x=scroll_x, scroll_y=scroll_y, type="scroll", x=x, y=y),
179
179
  pending_safety_checks=[],
180
180
  status="completed",
181
- type="computer_call"
181
+ type="computer_call",
182
182
  )
183
183
 
184
+
184
185
  def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
185
186
  return ResponseComputerToolCallParam(
186
187
  id=random_id(),
187
188
  call_id=call_id if call_id else random_id(),
188
- action=ActionTypeAction(
189
- text=text,
190
- type="type"
191
- ),
189
+ action=ActionTypeAction(text=text, type="type"),
192
190
  pending_safety_checks=[],
193
191
  status="completed",
194
- type="computer_call"
192
+ type="computer_call",
195
193
  )
196
194
 
195
+
197
196
  def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
198
197
  return ResponseComputerToolCallParam(
199
198
  id=random_id(),
200
199
  call_id=call_id if call_id else random_id(),
201
- action=ActionWait(
202
- type="wait"
203
- ),
200
+ action=ActionWait(type="wait"),
204
201
  pending_safety_checks=[],
205
202
  status="completed",
206
- type="computer_call"
203
+ type="computer_call",
207
204
  )
208
205
 
206
+
209
207
  # Extra anthropic computer calls
210
- def make_left_mouse_down_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
208
+ def make_left_mouse_down_item(
209
+ x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
210
+ ) -> Dict[str, Any]:
211
211
  return {
212
212
  "id": random_id(),
213
213
  "call_id": call_id if call_id else random_id(),
214
- "action": {
215
- "type": "left_mouse_down",
216
- "x": x,
217
- "y": y
218
- },
214
+ "action": {"type": "left_mouse_down", "x": x, "y": y},
219
215
  "pending_safety_checks": [],
220
216
  "status": "completed",
221
- "type": "computer_call"
217
+ "type": "computer_call",
222
218
  }
223
219
 
224
- def make_left_mouse_up_item(x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None) -> Dict[str, Any]:
220
+
221
+ def make_left_mouse_up_item(
222
+ x: Optional[int] = None, y: Optional[int] = None, call_id: Optional[str] = None
223
+ ) -> Dict[str, Any]:
225
224
  return {
226
225
  "id": random_id(),
227
226
  "call_id": call_id if call_id else random_id(),
228
- "action": {
229
- "type": "left_mouse_up",
230
- "x": x,
231
- "y": y
232
- },
227
+ "action": {"type": "left_mouse_up", "x": x, "y": y},
233
228
  "pending_safety_checks": [],
234
229
  "status": "completed",
235
- "type": "computer_call"
230
+ "type": "computer_call",
236
231
  }
237
232
 
238
- def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None) -> List[Dict[str, Any]]:
233
+
234
+ def make_failed_tool_call_items(
235
+ tool_name: str, tool_kwargs: Dict[str, Any], error_message: str, call_id: Optional[str] = None
236
+ ) -> List[Dict[str, Any]]:
239
237
  call_id = call_id if call_id else random_id()
240
238
  return [
241
239
  {
@@ -249,9 +247,10 @@ def make_failed_tool_call_items(tool_name: str, tool_kwargs: Dict[str, Any], err
249
247
  "type": "function_call_output",
250
248
  "call_id": call_id,
251
249
  "output": json.dumps({"error": error_message}),
252
- }
250
+ },
253
251
  ]
254
252
 
253
+
255
254
  def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> Dict[str, Any]:
256
255
  call_id = call_id if call_id else random_id()
257
256
  return {
@@ -260,12 +259,15 @@ def make_tool_error_item(error_message: str, call_id: Optional[str] = None) -> D
260
259
  "output": json.dumps({"error": error_message}),
261
260
  }
262
261
 
263
- def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
262
+
263
+ def replace_failed_computer_calls_with_function_calls(
264
+ messages: List[Dict[str, Any]],
265
+ ) -> List[Dict[str, Any]]:
264
266
  """
265
267
  Replace computer_call items with function_call items if they share a call_id with a function_call_output.
266
268
  This indicates the computer call failed and should be treated as a function call instead.
267
269
  We do this because the computer_call_output items do not support text output.
268
-
270
+
269
271
  Args:
270
272
  messages: List of message items to process
271
273
  """
@@ -278,16 +280,15 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
278
280
  call_id = msg.get("call_id")
279
281
  if call_id:
280
282
  failed_call_ids.add(call_id)
281
-
283
+
282
284
  # Replace computer_call items that have matching call_ids
283
285
  for i, msg in enumerate(messages):
284
- if (msg.get("type") == "computer_call" and
285
- msg.get("call_id") in failed_call_ids):
286
-
286
+ if msg.get("type") == "computer_call" and msg.get("call_id") in failed_call_ids:
287
+
287
288
  # Extract action from computer_call
288
289
  action = msg.get("action", {})
289
290
  call_id = msg.get("call_id")
290
-
291
+
291
292
  # Create function_call replacement
292
293
  messages[i] = {
293
294
  "type": "function_call",
@@ -296,27 +297,30 @@ def replace_failed_computer_calls_with_function_calls(messages: List[Dict[str, A
296
297
  "name": "computer",
297
298
  "arguments": json.dumps(action),
298
299
  }
299
-
300
+
300
301
  return messages
301
302
 
303
+
302
304
  # Conversion functions between element descriptions and coordinates
303
- def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
305
+ def convert_computer_calls_desc2xy(
306
+ responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
307
+ ) -> List[Dict[str, Any]]:
304
308
  """
305
309
  Convert computer calls from element descriptions to x,y coordinates.
306
-
310
+
307
311
  Args:
308
312
  responses_items: List of response items containing computer calls with element_description
309
313
  desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
310
-
314
+
311
315
  Returns:
312
316
  List of response items with element_description replaced by x,y coordinates
313
317
  """
314
318
  converted_items = []
315
-
319
+
316
320
  for item in responses_items:
317
321
  if item.get("type") == "computer_call" and "action" in item:
318
322
  action = item["action"].copy()
319
-
323
+
320
324
  # Handle single element_description
321
325
  if "element_description" in action:
322
326
  desc = action["element_description"]
@@ -325,48 +329,50 @@ def convert_computer_calls_desc2xy(responses_items: List[Dict[str, Any]], desc2x
325
329
  action["x"] = x
326
330
  action["y"] = y
327
331
  del action["element_description"]
328
-
332
+
329
333
  # Handle start_element_description and end_element_description for drag operations
330
334
  elif "start_element_description" in action and "end_element_description" in action:
331
335
  start_desc = action["start_element_description"]
332
336
  end_desc = action["end_element_description"]
333
-
337
+
334
338
  if start_desc in desc2xy and end_desc in desc2xy:
335
339
  start_x, start_y = desc2xy[start_desc]
336
340
  end_x, end_y = desc2xy[end_desc]
337
341
  action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
338
342
  del action["start_element_description"]
339
343
  del action["end_element_description"]
340
-
344
+
341
345
  converted_item = item.copy()
342
346
  converted_item["action"] = action
343
347
  converted_items.append(converted_item)
344
348
  else:
345
349
  converted_items.append(item)
346
-
350
+
347
351
  return converted_items
348
352
 
349
353
 
350
- def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]) -> List[Dict[str, Any]]:
354
+ def convert_computer_calls_xy2desc(
355
+ responses_items: List[Dict[str, Any]], desc2xy: Dict[str, tuple]
356
+ ) -> List[Dict[str, Any]]:
351
357
  """
352
358
  Convert computer calls from x,y coordinates to element descriptions.
353
-
359
+
354
360
  Args:
355
361
  responses_items: List of response items containing computer calls with x,y coordinates
356
362
  desc2xy: Dictionary mapping element descriptions to (x, y) coordinate tuples
357
-
363
+
358
364
  Returns:
359
365
  List of response items with x,y coordinates replaced by element_description
360
366
  """
361
367
  # Create reverse mapping from coordinates to descriptions
362
368
  xy2desc = {coords: desc for desc, coords in desc2xy.items()}
363
-
369
+
364
370
  converted_items = []
365
-
371
+
366
372
  for item in responses_items:
367
373
  if item.get("type") == "computer_call" and "action" in item:
368
374
  action = item["action"].copy()
369
-
375
+
370
376
  # Handle single x,y coordinates
371
377
  if "x" in action and "y" in action:
372
378
  coords = (action["x"], action["y"])
@@ -374,77 +380,83 @@ def convert_computer_calls_xy2desc(responses_items: List[Dict[str, Any]], desc2x
374
380
  action["element_description"] = xy2desc[coords]
375
381
  del action["x"]
376
382
  del action["y"]
377
-
383
+
378
384
  # Handle path for drag operations
379
385
  elif "path" in action and isinstance(action["path"], list) and len(action["path"]) == 2:
380
386
  start_point = action["path"][0]
381
387
  end_point = action["path"][1]
382
-
383
- if ("x" in start_point and "y" in start_point and
384
- "x" in end_point and "y" in end_point):
385
-
388
+
389
+ if (
390
+ "x" in start_point
391
+ and "y" in start_point
392
+ and "x" in end_point
393
+ and "y" in end_point
394
+ ):
395
+
386
396
  start_coords = (start_point["x"], start_point["y"])
387
397
  end_coords = (end_point["x"], end_point["y"])
388
-
398
+
389
399
  if start_coords in xy2desc and end_coords in xy2desc:
390
400
  action["start_element_description"] = xy2desc[start_coords]
391
401
  action["end_element_description"] = xy2desc[end_coords]
392
402
  del action["path"]
393
-
403
+
394
404
  converted_item = item.copy()
395
405
  converted_item["action"] = action
396
406
  converted_items.append(converted_item)
397
407
  else:
398
408
  converted_items.append(item)
399
-
409
+
400
410
  return converted_items
401
411
 
402
412
 
403
413
  def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[str]:
404
414
  """
405
415
  Extract all element descriptions from computer calls in responses items.
406
-
416
+
407
417
  Args:
408
418
  responses_items: List of response items containing computer calls
409
-
419
+
410
420
  Returns:
411
421
  List of unique element descriptions found in computer calls
412
422
  """
413
423
  descriptions = set()
414
-
424
+
415
425
  for item in responses_items:
416
426
  if item.get("type") == "computer_call" and "action" in item:
417
427
  action = item["action"]
418
-
428
+
419
429
  # Handle single element_description
420
430
  if "element_description" in action:
421
431
  descriptions.add(action["element_description"])
422
-
432
+
423
433
  # Handle start_element_description and end_element_description for drag operations
424
434
  if "start_element_description" in action:
425
435
  descriptions.add(action["start_element_description"])
426
-
436
+
427
437
  if "end_element_description" in action:
428
438
  descriptions.add(action["end_element_description"])
429
-
439
+
430
440
  return list(descriptions)
431
441
 
432
442
 
433
443
  # Conversion functions between responses_items and completion messages formats
434
- def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]], allow_images_in_tool_results: bool = True) -> List[Dict[str, Any]]:
444
+ def convert_responses_items_to_completion_messages(
445
+ messages: List[Dict[str, Any]], allow_images_in_tool_results: bool = True
446
+ ) -> List[Dict[str, Any]]:
435
447
  """Convert responses_items message format to liteLLM completion format.
436
-
448
+
437
449
  Args:
438
450
  messages: List of responses_items format messages
439
451
  allow_images_in_tool_results: If True, include images in tool role messages.
440
452
  If False, send tool message + separate user message with image.
441
453
  """
442
454
  completion_messages = []
443
-
455
+
444
456
  for message in messages:
445
457
  msg_type = message.get("type")
446
458
  role = message.get("role")
447
-
459
+
448
460
  # Handle user messages (both with and without explicit type)
449
461
  if role == "user" or msg_type == "user":
450
462
  content = message.get("content", "")
@@ -453,34 +465,19 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
453
465
  completion_content = []
454
466
  for item in content:
455
467
  if item.get("type") == "input_image":
456
- completion_content.append({
457
- "type": "image_url",
458
- "image_url": {
459
- "url": item.get("image_url")
460
- }
461
- })
468
+ completion_content.append(
469
+ {"type": "image_url", "image_url": {"url": item.get("image_url")}}
470
+ )
462
471
  elif item.get("type") == "input_text":
463
- completion_content.append({
464
- "type": "text",
465
- "text": item.get("text")
466
- })
472
+ completion_content.append({"type": "text", "text": item.get("text")})
467
473
  elif item.get("type") == "text":
468
- completion_content.append({
469
- "type": "text",
470
- "text": item.get("text")
471
- })
472
-
473
- completion_messages.append({
474
- "role": "user",
475
- "content": completion_content
476
- })
474
+ completion_content.append({"type": "text", "text": item.get("text")})
475
+
476
+ completion_messages.append({"role": "user", "content": completion_content})
477
477
  elif isinstance(content, str):
478
478
  # Handle string content
479
- completion_messages.append({
480
- "role": "user",
481
- "content": content
482
- })
483
-
479
+ completion_messages.append({"role": "user", "content": content})
480
+
484
481
  # Handle assistant messages
485
482
  elif role == "assistant" or msg_type == "message":
486
483
  content = message.get("content", [])
@@ -491,13 +488,12 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
491
488
  text_parts.append(item.get("text", ""))
492
489
  elif item.get("type") == "text":
493
490
  text_parts.append(item.get("text", ""))
494
-
491
+
495
492
  if text_parts:
496
- completion_messages.append({
497
- "role": "assistant",
498
- "content": "\n".join(text_parts)
499
- })
500
-
493
+ completion_messages.append(
494
+ {"role": "assistant", "content": "\n".join(text_parts)}
495
+ )
496
+
501
497
  # Handle reasoning items (convert to assistant message)
502
498
  elif msg_type == "reasoning":
503
499
  summary = message.get("summary", [])
@@ -505,107 +501,96 @@ def convert_responses_items_to_completion_messages(messages: List[Dict[str, Any]
505
501
  for item in summary:
506
502
  if item.get("type") == "summary_text":
507
503
  text_parts.append(item.get("text", ""))
508
-
504
+
509
505
  if text_parts:
510
- completion_messages.append({
511
- "role": "assistant",
512
- "content": "\n".join(text_parts)
513
- })
514
-
506
+ completion_messages.append({"role": "assistant", "content": "\n".join(text_parts)})
507
+
515
508
  # Handle function calls
516
509
  elif msg_type == "function_call":
517
510
  # Add tool call to last assistant message or create new one
518
511
  if not completion_messages or completion_messages[-1]["role"] != "assistant":
519
- completion_messages.append({
520
- "role": "assistant",
521
- "content": "",
522
- "tool_calls": []
523
- })
524
-
512
+ completion_messages.append({"role": "assistant", "content": "", "tool_calls": []})
513
+
525
514
  if "tool_calls" not in completion_messages[-1]:
526
515
  completion_messages[-1]["tool_calls"] = []
527
-
528
- completion_messages[-1]["tool_calls"].append({
529
- "id": message.get("call_id"),
530
- "type": "function",
531
- "function": {
532
- "name": message.get("name"),
533
- "arguments": message.get("arguments")
516
+
517
+ completion_messages[-1]["tool_calls"].append(
518
+ {
519
+ "id": message.get("call_id"),
520
+ "type": "function",
521
+ "function": {
522
+ "name": message.get("name"),
523
+ "arguments": message.get("arguments"),
524
+ },
534
525
  }
535
- })
536
-
526
+ )
527
+
537
528
  # Handle computer calls
538
529
  elif msg_type == "computer_call":
539
530
  # Add tool call to last assistant message or create new one
540
531
  if not completion_messages or completion_messages[-1]["role"] != "assistant":
541
- completion_messages.append({
542
- "role": "assistant",
543
- "content": "",
544
- "tool_calls": []
545
- })
546
-
532
+ completion_messages.append({"role": "assistant", "content": "", "tool_calls": []})
533
+
547
534
  if "tool_calls" not in completion_messages[-1]:
548
535
  completion_messages[-1]["tool_calls"] = []
549
-
536
+
550
537
  action = message.get("action", {})
551
- completion_messages[-1]["tool_calls"].append({
552
- "id": message.get("call_id"),
553
- "type": "function",
554
- "function": {
555
- "name": "computer",
556
- "arguments": json.dumps(action)
538
+ completion_messages[-1]["tool_calls"].append(
539
+ {
540
+ "id": message.get("call_id"),
541
+ "type": "function",
542
+ "function": {"name": "computer", "arguments": json.dumps(action)},
557
543
  }
558
- })
559
-
544
+ )
545
+
560
546
  # Handle function/computer call outputs
561
547
  elif msg_type in ["function_call_output", "computer_call_output"]:
562
548
  output = message.get("output")
563
549
  call_id = message.get("call_id")
564
-
550
+
565
551
  if isinstance(output, dict) and output.get("type") == "input_image":
566
552
  if allow_images_in_tool_results:
567
553
  # Handle image output as tool response (may not work with all APIs)
568
- completion_messages.append({
569
- "role": "tool",
570
- "tool_call_id": call_id,
571
- "content": [{
572
- "type": "image_url",
573
- "image_url": {
574
- "url": output.get("image_url")
575
- }
576
- }]
577
- })
554
+ completion_messages.append(
555
+ {
556
+ "role": "tool",
557
+ "tool_call_id": call_id,
558
+ "content": [
559
+ {"type": "image_url", "image_url": {"url": output.get("image_url")}}
560
+ ],
561
+ }
562
+ )
578
563
  else:
579
564
  # Send tool message + separate user message with image (OpenAI compatible)
580
- completion_messages += [{
581
- "role": "tool",
582
- "tool_call_id": call_id,
583
- "content": "[Execution completed. See screenshot below]"
584
- }, {
585
- "role": "user",
586
- "content": [{
587
- "type": "image_url",
588
- "image_url": {
589
- "url": output.get("image_url")
590
- }
591
- }]
592
- }]
565
+ completion_messages += [
566
+ {
567
+ "role": "tool",
568
+ "tool_call_id": call_id,
569
+ "content": "[Execution completed. See screenshot below]",
570
+ },
571
+ {
572
+ "role": "user",
573
+ "content": [
574
+ {"type": "image_url", "image_url": {"url": output.get("image_url")}}
575
+ ],
576
+ },
577
+ ]
593
578
  else:
594
579
  # Handle text output as tool response
595
- completion_messages.append({
596
- "role": "tool",
597
- "tool_call_id": call_id,
598
- "content": str(output)
599
- })
600
-
580
+ completion_messages.append(
581
+ {"role": "tool", "tool_call_id": call_id, "content": str(output)}
582
+ )
583
+
601
584
  return completion_messages
602
585
 
603
586
 
604
- def convert_completion_messages_to_responses_items(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
587
+ def convert_completion_messages_to_responses_items(
588
+ completion_messages: List[Dict[str, Any]],
589
+ ) -> List[Dict[str, Any]]:
605
590
  """Convert completion messages format to responses_items message format."""
606
591
  responses_items = []
607
592
  skip_next = False
608
-
593
+
609
594
  for i, message in enumerate(completion_messages):
610
595
  if skip_next:
611
596
  skip_next = False
@@ -614,25 +599,24 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
614
599
  role = message.get("role")
615
600
  content = message.get("content")
616
601
  tool_calls = message.get("tool_calls", [])
617
-
602
+
618
603
  # Handle assistant messages with text content
619
604
  if role == "assistant" and content and isinstance(content, str):
620
- responses_items.append({
621
- "type": "message",
622
- "role": "assistant",
623
- "content": [{
624
- "type": "output_text",
625
- "text": content
626
- }]
627
- })
628
-
605
+ responses_items.append(
606
+ {
607
+ "type": "message",
608
+ "role": "assistant",
609
+ "content": [{"type": "output_text", "text": content}],
610
+ }
611
+ )
612
+
629
613
  # Handle tool calls
630
614
  if tool_calls:
631
615
  for tool_call in tool_calls:
632
616
  if tool_call.get("type") == "function":
633
617
  function = tool_call.get("function", {})
634
618
  function_name = function.get("name")
635
-
619
+
636
620
  if function_name == "computer":
637
621
  # Parse computer action
638
622
  try:
@@ -641,31 +625,37 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
641
625
  if action.get("action"):
642
626
  action["type"] = action["action"]
643
627
  del action["action"]
644
- responses_items.append({
645
- "type": "computer_call",
646
- "call_id": tool_call.get("id"),
647
- "action": action,
648
- "status": "completed"
649
- })
628
+ responses_items.append(
629
+ {
630
+ "type": "computer_call",
631
+ "call_id": tool_call.get("id"),
632
+ "action": action,
633
+ "status": "completed",
634
+ }
635
+ )
650
636
  except json.JSONDecodeError:
651
637
  # Fallback to function call format
652
- responses_items.append({
638
+ responses_items.append(
639
+ {
640
+ "type": "function_call",
641
+ "call_id": tool_call.get("id"),
642
+ "name": function_name,
643
+ "arguments": function.get("arguments", "{}"),
644
+ "status": "completed",
645
+ }
646
+ )
647
+ else:
648
+ # Regular function call
649
+ responses_items.append(
650
+ {
653
651
  "type": "function_call",
654
652
  "call_id": tool_call.get("id"),
655
653
  "name": function_name,
656
654
  "arguments": function.get("arguments", "{}"),
657
- "status": "completed"
658
- })
659
- else:
660
- # Regular function call
661
- responses_items.append({
662
- "type": "function_call",
663
- "call_id": tool_call.get("id"),
664
- "name": function_name,
665
- "arguments": function.get("arguments", "{}"),
666
- "status": "completed"
667
- })
668
-
655
+ "status": "completed",
656
+ }
657
+ )
658
+
669
659
  # Handle tool messages (function/computer call outputs)
670
660
  elif role == "tool" and content:
671
661
  tool_call_id = message.get("tool_call_id")
@@ -674,74 +664,90 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
674
664
  if content == "[Execution completed. See screenshot below]":
675
665
  # Look ahead for the next user message with image
676
666
  next_idx = i + 1
677
- if (next_idx < len(completion_messages) and
678
- completion_messages[next_idx].get("role") == "user" and
679
- isinstance(completion_messages[next_idx].get("content"), list)):
667
+ if (
668
+ next_idx < len(completion_messages)
669
+ and completion_messages[next_idx].get("role") == "user"
670
+ and isinstance(completion_messages[next_idx].get("content"), list)
671
+ ):
680
672
  # Found the pattern - extract image from next message
681
673
  next_content = completion_messages[next_idx]["content"]
682
674
  for item in next_content:
683
675
  if item.get("type") == "image_url":
684
- responses_items.append({
685
- "type": "computer_call_output",
686
- "call_id": tool_call_id,
687
- "output": {
688
- "type": "input_image",
689
- "image_url": item.get("image_url", {}).get("url")
676
+ responses_items.append(
677
+ {
678
+ "type": "computer_call_output",
679
+ "call_id": tool_call_id,
680
+ "output": {
681
+ "type": "input_image",
682
+ "image_url": item.get("image_url", {}).get("url"),
683
+ },
690
684
  }
691
- })
685
+ )
692
686
  # Skip the next user message since we processed it
693
687
  skip_next = True
694
688
  break
695
689
  else:
696
690
  # No matching user message, treat as regular text
697
- responses_items.append({
698
- "type": "computer_call_output",
699
- "call_id": tool_call_id,
700
- "output": content
701
- })
691
+ responses_items.append(
692
+ {
693
+ "type": "computer_call_output",
694
+ "call_id": tool_call_id,
695
+ "output": content,
696
+ }
697
+ )
702
698
  else:
703
699
  # Determine if this is a computer call or function call output
704
700
  try:
705
701
  # Try to parse as structured output
706
702
  parsed_content = json.loads(content)
707
703
  if parsed_content.get("type") == "input_image":
708
- responses_items.append({
709
- "type": "computer_call_output",
710
- "call_id": tool_call_id,
711
- "output": parsed_content
712
- })
704
+ responses_items.append(
705
+ {
706
+ "type": "computer_call_output",
707
+ "call_id": tool_call_id,
708
+ "output": parsed_content,
709
+ }
710
+ )
713
711
  else:
714
- responses_items.append({
715
- "type": "computer_call_output",
716
- "call_id": tool_call_id,
717
- "output": content
718
- })
712
+ responses_items.append(
713
+ {
714
+ "type": "computer_call_output",
715
+ "call_id": tool_call_id,
716
+ "output": content,
717
+ }
718
+ )
719
719
  except json.JSONDecodeError:
720
720
  # Plain text output - could be function or computer call
721
- responses_items.append({
722
- "type": "function_call_output",
723
- "call_id": tool_call_id,
724
- "output": content
725
- })
721
+ responses_items.append(
722
+ {
723
+ "type": "function_call_output",
724
+ "call_id": tool_call_id,
725
+ "output": content,
726
+ }
727
+ )
726
728
  elif isinstance(content, list):
727
729
  # Handle structured content (e.g., images)
728
730
  for item in content:
729
731
  if item.get("type") == "image_url":
730
- responses_items.append({
731
- "type": "computer_call_output",
732
- "call_id": tool_call_id,
733
- "output": {
734
- "type": "input_image",
735
- "image_url": item.get("image_url", {}).get("url")
732
+ responses_items.append(
733
+ {
734
+ "type": "computer_call_output",
735
+ "call_id": tool_call_id,
736
+ "output": {
737
+ "type": "input_image",
738
+ "image_url": item.get("image_url", {}).get("url"),
739
+ },
736
740
  }
737
- })
741
+ )
738
742
  elif item.get("type") == "text":
739
- responses_items.append({
740
- "type": "function_call_output",
741
- "call_id": tool_call_id,
742
- "output": item.get("text")
743
- })
744
-
743
+ responses_items.append(
744
+ {
745
+ "type": "function_call_output",
746
+ "call_id": tool_call_id,
747
+ "output": item.get("text"),
748
+ }
749
+ )
750
+
745
751
  # Handle actual user messages
746
752
  elif role == "user" and content:
747
753
  if isinstance(content, list):
@@ -749,27 +755,21 @@ def convert_completion_messages_to_responses_items(completion_messages: List[Dic
749
755
  user_content = []
750
756
  for item in content:
751
757
  if item.get("type") == "image_url":
752
- user_content.append({
753
- "type": "input_image",
754
- "image_url": item.get("image_url", {}).get("url")
755
- })
758
+ user_content.append(
759
+ {
760
+ "type": "input_image",
761
+ "image_url": item.get("image_url", {}).get("url"),
762
+ }
763
+ )
756
764
  elif item.get("type") == "text":
757
- user_content.append({
758
- "type": "input_text",
759
- "text": item.get("text")
760
- })
761
-
765
+ user_content.append({"type": "input_text", "text": item.get("text")})
766
+
762
767
  if user_content:
763
- responses_items.append({
764
- "role": "user",
765
- "type": "message",
766
- "content": user_content
767
- })
768
+ responses_items.append(
769
+ {"role": "user", "type": "message", "content": user_content}
770
+ )
768
771
  elif isinstance(content, str):
769
772
  # Handle simple text user message
770
- responses_items.append({
771
- "role": "user",
772
- "content": content
773
- })
774
-
773
+ responses_items.append({"role": "user", "content": content})
774
+
775
775
  return responses_items