cua-agent: cua_agent-0.4.34-py3-none-any.whl → cua_agent-0.4.35-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61):
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/anthropic.py CHANGED
@@ -4,30 +4,33 @@ Anthropic hosted tools agent loop implementation using liteLLM
4
4
 
5
5
  import asyncio
6
6
  import json
7
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
7
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
8
+
8
9
  import litellm
9
- from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
10
+ from litellm.responses.litellm_completion_transformation.transformation import (
11
+ LiteLLMCompletionResponsesConfig,
12
+ )
10
13
 
11
14
  from ..decorators import register_agent
12
- from ..types import Messages, AgentResponse, Tools, AgentCapability
13
15
  from ..loops.base import AsyncAgentConfig
14
16
  from ..responses import (
15
- make_reasoning_item,
16
- make_output_text_item,
17
17
  make_click_item,
18
18
  make_double_click_item,
19
19
  make_drag_item,
20
+ make_failed_tool_call_items,
21
+ make_input_image_item,
20
22
  make_keypress_item,
23
+ make_left_mouse_down_item,
24
+ make_left_mouse_up_item,
21
25
  make_move_item,
26
+ make_output_text_item,
27
+ make_reasoning_item,
28
+ make_screenshot_item,
22
29
  make_scroll_item,
23
30
  make_type_item,
24
31
  make_wait_item,
25
- make_input_image_item,
26
- make_screenshot_item,
27
- make_failed_tool_call_items,
28
- make_left_mouse_down_item,
29
- make_left_mouse_up_item
30
32
  )
33
+ from ..types import AgentCapability, AgentResponse, Messages, Tools
31
34
 
32
35
  # Model version mapping to tool version and beta flag
33
36
  MODEL_TOOL_MAPPING = [
@@ -35,38 +38,34 @@ MODEL_TOOL_MAPPING = [
35
38
  {
36
39
  "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
37
40
  "tool_version": "computer_20250124",
38
- "beta_flag": "computer-use-2025-01-24"
41
+ "beta_flag": "computer-use-2025-01-24",
39
42
  },
40
43
  # Claude 3.7 models
41
44
  {
42
45
  "pattern": r"claude-3\.?7|claude-3-7",
43
46
  "tool_version": "computer_20250124",
44
- "beta_flag": "computer-use-2025-01-24"
47
+ "beta_flag": "computer-use-2025-01-24",
45
48
  },
46
49
  # Claude 3.5 models (fallback)
47
50
  {
48
51
  "pattern": r"claude-3\.?5|claude-3-5",
49
52
  "tool_version": "computer_20241022",
50
- "beta_flag": "computer-use-2024-10-22"
51
- }
53
+ "beta_flag": "computer-use-2024-10-22",
54
+ },
52
55
  ]
53
56
 
57
+
54
58
  def _get_tool_config_for_model(model: str) -> Dict[str, str]:
55
59
  """Get tool version and beta flag for the given model."""
56
60
  import re
57
-
61
+
58
62
  for mapping in MODEL_TOOL_MAPPING:
59
63
  if re.search(mapping["pattern"], model, re.IGNORECASE):
60
- return {
61
- "tool_version": mapping["tool_version"],
62
- "beta_flag": mapping["beta_flag"]
63
- }
64
-
64
+ return {"tool_version": mapping["tool_version"], "beta_flag": mapping["beta_flag"]}
65
+
65
66
  # Default to Claude 3.5 configuration
66
- return {
67
- "tool_version": "computer_20241022",
68
- "beta_flag": "computer-use-2024-10-22"
69
- }
67
+ return {"tool_version": "computer_20241022", "beta_flag": "computer-use-2024-10-22"}
68
+
70
69
 
71
70
  async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
72
71
  """Map a computer tool to Anthropic's hosted tool schema."""
@@ -76,7 +75,7 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
76
75
  except Exception:
77
76
  # Fallback to default dimensions if method fails
78
77
  width, height = 1024, 768
79
-
78
+
80
79
  return {
81
80
  "type": tool_version,
82
81
  "function": {
@@ -89,32 +88,37 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
89
88
  },
90
89
  }
91
90
 
91
+
92
92
  async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
93
93
  """Prepare tools for Anthropic API format."""
94
94
  tool_config = _get_tool_config_for_model(model)
95
95
  anthropic_tools = []
96
-
96
+
97
97
  for schema in tool_schemas:
98
98
  if schema["type"] == "computer":
99
99
  # Map computer tool to Anthropic format
100
- anthropic_tools.append(await _map_computer_tool_to_anthropic(
101
- schema["computer"],
102
- tool_config["tool_version"]
103
- ))
100
+ anthropic_tools.append(
101
+ await _map_computer_tool_to_anthropic(
102
+ schema["computer"], tool_config["tool_version"]
103
+ )
104
+ )
104
105
  elif schema["type"] == "function":
105
106
  # Function tools - convert to Anthropic format
106
107
  function_schema = schema["function"]
107
- anthropic_tools.append({
108
- "type": "function",
109
- "function": {
110
- "name": function_schema["name"],
111
- "description": function_schema.get("description", ""),
112
- "parameters": function_schema.get("parameters", {})
108
+ anthropic_tools.append(
109
+ {
110
+ "type": "function",
111
+ "function": {
112
+ "name": function_schema["name"],
113
+ "description": function_schema.get("description", ""),
114
+ "parameters": function_schema.get("parameters", {}),
115
+ },
113
116
  }
114
- })
115
-
117
+ )
118
+
116
119
  return anthropic_tools
117
120
 
121
+
118
122
  def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
119
123
  """Convert responses_items message format to liteLLM completion format."""
120
124
  completion_messages = []
@@ -123,7 +127,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
123
127
  for message in messages:
124
128
  msg_type = message.get("type")
125
129
  role = message.get("role")
126
-
130
+
127
131
  # Handle user messages (both with and without explicit type)
128
132
  if role == "user" or msg_type == "user":
129
133
  content = message.get("content", "")
@@ -135,51 +139,38 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
135
139
  # Convert input_image to OpenAI image format
136
140
  image_url = item.get("image_url", "")
137
141
  if image_url and image_url != "[omitted]":
138
- converted_content.append({
139
- "type": "image_url",
140
- "image_url": {
141
- "url": image_url
142
- }
143
- })
142
+ converted_content.append(
143
+ {"type": "image_url", "image_url": {"url": image_url}}
144
+ )
144
145
  elif isinstance(item, dict) and item.get("type") == "input_text":
145
146
  # Convert input_text to OpenAI text format
146
147
  text = item.get("text", "")
147
- converted_content.append({
148
- "type": "text",
149
- "text": text
150
- })
148
+ converted_content.append({"type": "text", "text": text})
151
149
  else:
152
150
  # Keep other content types as-is
153
151
  converted_content.append(item)
154
-
155
- completion_messages.append({
156
- "role": "user",
157
- "content": converted_content if converted_content else content
158
- })
152
+
153
+ completion_messages.append(
154
+ {"role": "user", "content": converted_content if converted_content else content}
155
+ )
159
156
  else:
160
157
  # Text content
161
- completion_messages.append({
162
- "role": "user",
163
- "content": content
164
- })
165
-
158
+ completion_messages.append({"role": "user", "content": content})
159
+
166
160
  # Handle assistant messages
167
161
  elif role == "assistant":
168
162
  content = message.get("content", [])
169
163
  if isinstance(content, str):
170
- content = [{ "type": "output_text", "text": content }]
171
-
164
+ content = [{"type": "output_text", "text": content}]
165
+
172
166
  content = "\n".join(item.get("text", "") for item in content)
173
- completion_messages.append({
174
- "role": "assistant",
175
- "content": content
176
- })
177
-
167
+ completion_messages.append({"role": "assistant", "content": content})
168
+
178
169
  elif msg_type == "reasoning":
179
170
  # Reasoning becomes part of assistant message
180
171
  summary = message.get("summary", [])
181
172
  reasoning_text = ""
182
-
173
+
183
174
  if isinstance(summary, list) and summary:
184
175
  # Extract text from summary items
185
176
  for item in summary:
@@ -189,58 +180,54 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
189
180
  else:
190
181
  # Fallback to direct reasoning field
191
182
  reasoning_text = message.get("reasoning", "")
192
-
183
+
193
184
  if reasoning_text:
194
- completion_messages.append({
195
- "role": "assistant",
196
- "content": reasoning_text
197
- })
198
-
185
+ completion_messages.append({"role": "assistant", "content": reasoning_text})
186
+
199
187
  elif msg_type == "function_call":
200
188
  fn_name = message.get("name")
201
189
  fn_args = message.get("arguments", "{}")
202
190
  call_id = message.get("call_id", "call_1")
203
191
  call_id_to_fn_name[call_id] = fn_name
204
- openai_tool_calls = [{
205
- "id": call_id,
206
- "type": "function",
207
- "function": {
208
- "name": fn_name,
209
- "arguments": fn_args
192
+ openai_tool_calls = [
193
+ {
194
+ "id": call_id,
195
+ "type": "function",
196
+ "function": {"name": fn_name, "arguments": fn_args},
210
197
  }
211
- }] # If the last completion message is an assistant message, extend the tool_calls
198
+ ] # If the last completion message is an assistant message, extend the tool_calls
212
199
  if completion_messages and completion_messages[-1].get("role") == "assistant":
213
200
  if "tool_calls" not in completion_messages[-1]:
214
201
  completion_messages[-1]["tool_calls"] = []
215
202
  completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
216
203
  else:
217
204
  # Create new assistant message with tool calls
218
- completion_messages.append({
219
- "role": "assistant",
220
- "content": None,
221
- "tool_calls": openai_tool_calls
222
- })
223
-
205
+ completion_messages.append(
206
+ {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
207
+ )
208
+
224
209
  elif msg_type == "function_call_output":
225
210
  call_id = message.get("call_id", "call_1")
226
211
  fn_output = message.get("output", "")
227
212
  fn_name = call_id_to_fn_name.get(call_id, "computer")
228
213
 
229
- completion_messages.append({
230
- "role": "function",
231
- "name": fn_name,
232
- "tool_call_id": call_id,
233
- "content": str(fn_output)
234
- })
235
-
214
+ completion_messages.append(
215
+ {
216
+ "role": "function",
217
+ "name": fn_name,
218
+ "tool_call_id": call_id,
219
+ "content": str(fn_output),
220
+ }
221
+ )
222
+
236
223
  elif msg_type == "computer_call":
237
224
  # Computer call becomes tool use in assistant message
238
225
  action = message.get("action", {})
239
226
  action_type = action.get("type")
240
227
  call_id = message.get("call_id", "call_1")
241
-
228
+
242
229
  tool_use_content = []
243
-
230
+
244
231
  # Basic actions (all versions)
245
232
  if action_type == "click":
246
233
  # Input:
@@ -253,7 +240,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
253
240
  # "y": 200
254
241
  # }
255
242
  # }
256
-
243
+
257
244
  # Output:
258
245
  # {
259
246
  # "function": {
@@ -267,16 +254,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
267
254
  # "type": "function"
268
255
  # }
269
256
  button = action.get("button", "left")
270
- action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click"
271
- tool_use_content.append({
272
- "type": "tool_use",
273
- "id": call_id,
274
- "name": "computer",
275
- "input": {
276
- "action": action_name,
277
- "coordinate": [action.get("x", 0), action.get("y", 0)]
257
+ action_name = (
258
+ "right_click"
259
+ if button == "right"
260
+ else "middle_click" if button == "wheel" else "left_click"
261
+ )
262
+ tool_use_content.append(
263
+ {
264
+ "type": "tool_use",
265
+ "id": call_id,
266
+ "name": "computer",
267
+ "input": {
268
+ "action": action_name,
269
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
270
+ },
278
271
  }
279
- })
272
+ )
280
273
  elif action_type == "double_click":
281
274
  # Input:
282
275
  # {
@@ -288,7 +281,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
288
281
  # "y": 240
289
282
  # }
290
283
  # }
291
-
284
+
292
285
  # Output:
293
286
  # {
294
287
  # "function": {
@@ -301,15 +294,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
301
294
  # "id": "call_1",
302
295
  # "type": "function"
303
296
  # }
304
- tool_use_content.append({
305
- "type": "tool_use",
306
- "id": call_id,
307
- "name": "computer",
308
- "input": {
309
- "action": "double_click",
310
- "coordinate": [action.get("x", 0), action.get("y", 0)]
297
+ tool_use_content.append(
298
+ {
299
+ "type": "tool_use",
300
+ "id": call_id,
301
+ "name": "computer",
302
+ "input": {
303
+ "action": "double_click",
304
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
305
+ },
311
306
  }
312
- })
307
+ )
313
308
  elif action_type == "type":
314
309
  # Input:
315
310
  # {
@@ -320,7 +315,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
320
315
  # "text": "Hello World"
321
316
  # }
322
317
  # }
323
-
318
+
324
319
  # Output:
325
320
  # {
326
321
  # "function": {
@@ -333,15 +328,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
333
328
  # "id": "call_1",
334
329
  # "type": "function"
335
330
  # }
336
- tool_use_content.append({
337
- "type": "tool_use",
338
- "id": call_id,
339
- "name": "computer",
340
- "input": {
341
- "action": "type",
342
- "text": action.get("text", "")
331
+ tool_use_content.append(
332
+ {
333
+ "type": "tool_use",
334
+ "id": call_id,
335
+ "name": "computer",
336
+ "input": {"action": "type", "text": action.get("text", "")},
343
337
  }
344
- })
338
+ )
345
339
  elif action_type == "keypress":
346
340
  # Input:
347
341
  # {
@@ -352,7 +346,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
352
346
  # "keys": ["ctrl", "c"]
353
347
  # }
354
348
  # }
355
-
349
+
356
350
  # Output:
357
351
  # {
358
352
  # "function": {
@@ -365,15 +359,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
365
359
  # "id": "call_1",
366
360
  # "type": "function"
367
361
  # }
368
- tool_use_content.append({
369
- "type": "tool_use",
370
- "id": call_id,
371
- "name": "computer",
372
- "input": {
373
- "action": "key",
374
- "text": "+".join(action.get("keys", []))
362
+ tool_use_content.append(
363
+ {
364
+ "type": "tool_use",
365
+ "id": call_id,
366
+ "name": "computer",
367
+ "input": {"action": "key", "text": "+".join(action.get("keys", []))},
375
368
  }
376
- })
369
+ )
377
370
  elif action_type in ["mouse_move", "move"]:
378
371
  # Input:
379
372
  # {
@@ -385,7 +378,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
385
378
  # "y": 250
386
379
  # }
387
380
  # }
388
-
381
+
389
382
  # Output:
390
383
  # {
391
384
  # "function": {
@@ -398,15 +391,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
398
391
  # "id": "call_1",
399
392
  # "type": "function"
400
393
  # }
401
- tool_use_content.append({
402
- "type": "tool_use",
403
- "id": call_id,
404
- "name": "computer",
405
- "input": {
406
- "action": "mouse_move",
407
- "coordinate": [action.get("x", 0), action.get("y", 0)]
394
+ tool_use_content.append(
395
+ {
396
+ "type": "tool_use",
397
+ "id": call_id,
398
+ "name": "computer",
399
+ "input": {
400
+ "action": "mouse_move",
401
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
402
+ },
408
403
  }
409
- })
404
+ )
410
405
  elif action_type == "scroll":
411
406
  # Input:
412
407
  # {
@@ -420,7 +415,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
420
415
  # "scroll_y": -5
421
416
  # }
422
417
  # }
423
-
418
+
424
419
  # Output:
425
420
  # {
426
421
  # "function": {
@@ -453,18 +448,20 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
453
448
  else:
454
449
  direction = "down"
455
450
  amount = 3
456
-
457
- tool_use_content.append({
458
- "type": "tool_use",
459
- "id": call_id,
460
- "name": "computer",
461
- "input": {
462
- "action": "scroll",
463
- "coordinate": [action.get("x", 0), action.get("y", 0)],
464
- "scroll_direction": direction,
465
- "scroll_amount": amount
451
+
452
+ tool_use_content.append(
453
+ {
454
+ "type": "tool_use",
455
+ "id": call_id,
456
+ "name": "computer",
457
+ "input": {
458
+ "action": "scroll",
459
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
460
+ "scroll_direction": direction,
461
+ "scroll_amount": amount,
462
+ },
466
463
  }
467
- })
464
+ )
468
465
  elif action_type == "drag":
469
466
  # Input:
470
467
  # {
@@ -478,7 +475,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
478
475
  # ]
479
476
  # }
480
477
  # }
481
-
478
+
482
479
  # Output:
483
480
  # {
484
481
  # "function": {
@@ -498,17 +495,19 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
498
495
  if isinstance(path, list) and len(path) >= 2:
499
496
  start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
500
497
  end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
501
-
502
- tool_use_content.append({
503
- "type": "tool_use",
504
- "id": call_id,
505
- "name": "computer",
506
- "input": {
507
- "action": "left_click_drag",
508
- "start_coordinate": start_coord,
509
- "end_coordinate": end_coord
498
+
499
+ tool_use_content.append(
500
+ {
501
+ "type": "tool_use",
502
+ "id": call_id,
503
+ "name": "computer",
504
+ "input": {
505
+ "action": "left_click_drag",
506
+ "start_coordinate": start_coord,
507
+ "end_coordinate": end_coord,
508
+ },
510
509
  }
511
- })
510
+ )
512
511
  elif action_type == "wait":
513
512
  # Input:
514
513
  # {
@@ -518,7 +517,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
518
517
  # "type": "wait"
519
518
  # }
520
519
  # }
521
-
520
+
522
521
  # Output:
523
522
  # {
524
523
  # "function": {
@@ -530,14 +529,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
530
529
  # "id": "call_1",
531
530
  # "type": "function"
532
531
  # }
533
- tool_use_content.append({
534
- "type": "tool_use",
535
- "id": call_id,
536
- "name": "computer",
537
- "input": {
538
- "action": "wait"
532
+ tool_use_content.append(
533
+ {
534
+ "type": "tool_use",
535
+ "id": call_id,
536
+ "name": "computer",
537
+ "input": {"action": "wait"},
539
538
  }
540
- })
539
+ )
541
540
  elif action_type == "screenshot":
542
541
  # Input:
543
542
  # {
@@ -547,7 +546,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
547
546
  # "type": "screenshot"
548
547
  # }
549
548
  # }
550
-
549
+
551
550
  # Output:
552
551
  # {
553
552
  # "function": {
@@ -559,47 +558,53 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
559
558
  # "id": "call_1",
560
559
  # "type": "function"
561
560
  # }
562
- tool_use_content.append({
563
- "type": "tool_use",
564
- "id": call_id,
565
- "name": "computer",
566
- "input": {
567
- "action": "screenshot"
561
+ tool_use_content.append(
562
+ {
563
+ "type": "tool_use",
564
+ "id": call_id,
565
+ "name": "computer",
566
+ "input": {"action": "screenshot"},
568
567
  }
569
- })
568
+ )
570
569
  elif action_type == "left_mouse_down":
571
- tool_use_content.append({
572
- "type": "tool_use",
573
- "id": call_id,
574
- "name": "computer",
575
- "input": {
576
- "action": "left_mouse_down",
577
- "coordinate": [action.get("x", None), action.get("y", None)]
570
+ tool_use_content.append(
571
+ {
572
+ "type": "tool_use",
573
+ "id": call_id,
574
+ "name": "computer",
575
+ "input": {
576
+ "action": "left_mouse_down",
577
+ "coordinate": [action.get("x", None), action.get("y", None)],
578
+ },
578
579
  }
579
- })
580
+ )
580
581
  elif action_type == "left_mouse_up":
581
- tool_use_content.append({
582
- "type": "tool_use",
583
- "id": call_id,
584
- "name": "computer",
585
- "input": {
586
- "action": "left_mouse_up",
587
- "coordinate": [action.get("x", None), action.get("y", None)]
582
+ tool_use_content.append(
583
+ {
584
+ "type": "tool_use",
585
+ "id": call_id,
586
+ "name": "computer",
587
+ "input": {
588
+ "action": "left_mouse_up",
589
+ "coordinate": [action.get("x", None), action.get("y", None)],
590
+ },
588
591
  }
589
- })
590
-
592
+ )
593
+
591
594
  # Convert tool_use_content to OpenAI tool_calls format
592
595
  openai_tool_calls = []
593
596
  for tool_use in tool_use_content:
594
- openai_tool_calls.append({
595
- "id": tool_use["id"],
596
- "type": "function",
597
- "function": {
598
- "name": tool_use["name"],
599
- "arguments": json.dumps(tool_use["input"])
597
+ openai_tool_calls.append(
598
+ {
599
+ "id": tool_use["id"],
600
+ "type": "function",
601
+ "function": {
602
+ "name": tool_use["name"],
603
+ "arguments": json.dumps(tool_use["input"]),
604
+ },
600
605
  }
601
- })
602
-
606
+ )
607
+
603
608
  # If the last completion message is an assistant message, extend the tool_calls
604
609
  if completion_messages and completion_messages[-1].get("role") == "assistant":
605
610
  if "tool_calls" not in completion_messages[-1]:
@@ -607,54 +612,52 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
607
612
  completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
608
613
  else:
609
614
  # Create new assistant message with tool calls
610
- completion_messages.append({
611
- "role": "assistant",
612
- "content": None,
613
- "tool_calls": openai_tool_calls
614
- })
615
-
615
+ completion_messages.append(
616
+ {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
617
+ )
618
+
616
619
  elif msg_type == "computer_call_output":
617
620
  # Computer call output becomes OpenAI function result
618
621
  output = message.get("output", {})
619
622
  call_id = message.get("call_id", "call_1")
620
-
623
+
621
624
  if output.get("type") == "input_image":
622
625
  # Screenshot result - convert to OpenAI format with image_url content
623
626
  image_url = output.get("image_url", "")
624
- completion_messages.append({
625
- "role": "function",
626
- "name": "computer",
627
- "tool_call_id": call_id,
628
- "content": [{
629
- "type": "image_url",
630
- "image_url": {
631
- "url": image_url
632
- }
633
- }]
634
- })
627
+ completion_messages.append(
628
+ {
629
+ "role": "function",
630
+ "name": "computer",
631
+ "tool_call_id": call_id,
632
+ "content": [{"type": "image_url", "image_url": {"url": image_url}}],
633
+ }
634
+ )
635
635
  else:
636
636
  # Text result - convert to OpenAI format
637
- completion_messages.append({
638
- "role": "function",
639
- "name": "computer",
640
- "tool_call_id": call_id,
641
- "content": str(output)
642
- })
643
-
637
+ completion_messages.append(
638
+ {
639
+ "role": "function",
640
+ "name": "computer",
641
+ "tool_call_id": call_id,
642
+ "content": str(output),
643
+ }
644
+ )
645
+
644
646
  return completion_messages
645
647
 
648
+
646
649
  def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
647
650
  """Convert liteLLM completion response to responses_items message format."""
648
651
  responses_items = []
649
-
650
- if not response or not hasattr(response, 'choices') or not response.choices:
652
+
653
+ if not response or not hasattr(response, "choices") or not response.choices:
651
654
  return responses_items
652
-
655
+
653
656
  choice = response.choices[0]
654
657
  message = choice.message
655
-
658
+
656
659
  # Handle text content
657
- if hasattr(message, 'content') and message.content:
660
+ if hasattr(message, "content") and message.content:
658
661
  if isinstance(message.content, str):
659
662
  responses_items.append(make_output_text_item(message.content))
660
663
  elif isinstance(message.content, list):
@@ -667,31 +670,36 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
667
670
  tool_input = content_item.get("input", {})
668
671
  action_type = tool_input.get("action")
669
672
  call_id = content_item.get("id")
670
-
673
+
671
674
  # Action reference:
672
675
  # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions
673
-
676
+
674
677
  try:
675
678
  # Basic actions (all versions)
676
679
  if action_type == "screenshot":
677
680
  responses_items.append(make_screenshot_item(call_id=call_id))
678
681
  elif action_type in ["click", "left_click"]:
679
682
  coordinate = tool_input.get("coordinate", [0, 0])
680
- responses_items.append(make_click_item(
681
- x=coordinate[0] if len(coordinate) > 0 else 0,
682
- y=coordinate[1] if len(coordinate) > 1 else 0,
683
- call_id=call_id
684
- ))
683
+ responses_items.append(
684
+ make_click_item(
685
+ x=coordinate[0] if len(coordinate) > 0 else 0,
686
+ y=coordinate[1] if len(coordinate) > 1 else 0,
687
+ call_id=call_id,
688
+ )
689
+ )
685
690
  elif action_type in ["type", "type_text"]:
686
- responses_items.append(make_type_item(
687
- text=tool_input.get("text", ""),
688
- call_id=call_id
689
- ))
691
+ responses_items.append(
692
+ make_type_item(text=tool_input.get("text", ""), call_id=call_id)
693
+ )
690
694
  elif action_type in ["key", "keypress", "hotkey"]:
691
- responses_items.append(make_keypress_item(
692
- keys=tool_input.get("text", "").replace("+", "-").split("-"),
693
- call_id=call_id
694
- ))
695
+ responses_items.append(
696
+ make_keypress_item(
697
+ keys=tool_input.get("text", "")
698
+ .replace("+", "-")
699
+ .split("-"),
700
+ call_id=call_id,
701
+ )
702
+ )
695
703
  elif action_type in ["mouse_move", "move_cursor", "move"]:
696
704
  # Mouse move - create a custom action item
697
705
  coordinate = tool_input.get("coordinate", [0, 0])
@@ -699,64 +707,88 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
699
707
  make_move_item(
700
708
  x=coordinate[0] if len(coordinate) > 0 else 0,
701
709
  y=coordinate[1] if len(coordinate) > 1 else 0,
702
- call_id=call_id
710
+ call_id=call_id,
703
711
  )
704
712
  )
705
-
713
+
706
714
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
707
715
  elif action_type == "scroll":
708
716
  coordinate = tool_input.get("coordinate", [0, 0])
709
717
  scroll_amount = tool_input.get("scroll_amount", 3)
710
- scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
711
- -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
712
- scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
713
- -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
714
- responses_items.append(make_scroll_item(
715
- x=coordinate[0] if len(coordinate) > 0 else 0,
716
- y=coordinate[1] if len(coordinate) > 1 else 0,
717
- scroll_x=scroll_x,
718
- scroll_y=scroll_y,
719
- call_id=call_id
720
- ))
718
+ scroll_x = (
719
+ scroll_amount
720
+ if tool_input.get("scroll_direction", "down") == "right"
721
+ else (
722
+ -scroll_amount
723
+ if tool_input.get("scroll_direction", "down") == "left"
724
+ else 0
725
+ )
726
+ )
727
+ scroll_y = (
728
+ scroll_amount
729
+ if tool_input.get("scroll_direction", "down") == "down"
730
+ else (
731
+ -scroll_amount
732
+ if tool_input.get("scroll_direction", "down") == "up"
733
+ else 0
734
+ )
735
+ )
736
+ responses_items.append(
737
+ make_scroll_item(
738
+ x=coordinate[0] if len(coordinate) > 0 else 0,
739
+ y=coordinate[1] if len(coordinate) > 1 else 0,
740
+ scroll_x=scroll_x,
741
+ scroll_y=scroll_y,
742
+ call_id=call_id,
743
+ )
744
+ )
721
745
  elif action_type in ["left_click_drag", "drag"]:
722
746
  start_coord = tool_input.get("start_coordinate", [0, 0])
723
747
  end_coord = tool_input.get("end_coordinate", [0, 0])
724
- responses_items.append(make_drag_item(
725
- path=[
726
- {
727
- "x": start_coord[0] if len(start_coord) > 0 else 0,
728
- "y": start_coord[1] if len(start_coord) > 1 else 0
729
- },
730
- {
731
- "x": end_coord[0] if len(end_coord) > 0 else 0,
732
- "y": end_coord[1] if len(end_coord) > 1 else 0
733
- }
734
- ],
735
- call_id=call_id
736
- ))
748
+ responses_items.append(
749
+ make_drag_item(
750
+ path=[
751
+ {
752
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
753
+ "y": start_coord[1] if len(start_coord) > 1 else 0,
754
+ },
755
+ {
756
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
757
+ "y": end_coord[1] if len(end_coord) > 1 else 0,
758
+ },
759
+ ],
760
+ call_id=call_id,
761
+ )
762
+ )
737
763
  elif action_type == "right_click":
738
764
  coordinate = tool_input.get("coordinate", [0, 0])
739
- responses_items.append(make_click_item(
740
- x=coordinate[0] if len(coordinate) > 0 else 0,
741
- y=coordinate[1] if len(coordinate) > 1 else 0,
742
- button="right",
743
- call_id=call_id
744
- ))
765
+ responses_items.append(
766
+ make_click_item(
767
+ x=coordinate[0] if len(coordinate) > 0 else 0,
768
+ y=coordinate[1] if len(coordinate) > 1 else 0,
769
+ button="right",
770
+ call_id=call_id,
771
+ )
772
+ )
745
773
  elif action_type == "middle_click":
746
774
  coordinate = tool_input.get("coordinate", [0, 0])
747
- responses_items.append(make_click_item(
748
- x=coordinate[0] if len(coordinate) > 0 else 0,
749
- y=coordinate[1] if len(coordinate) > 1 else 0,
750
- button="wheel",
751
- call_id=call_id
752
- ))
775
+ responses_items.append(
776
+ make_click_item(
777
+ x=coordinate[0] if len(coordinate) > 0 else 0,
778
+ y=coordinate[1] if len(coordinate) > 1 else 0,
779
+ button="wheel",
780
+ call_id=call_id,
781
+ )
782
+ )
753
783
  elif action_type == "double_click":
754
784
  coordinate = tool_input.get("coordinate", [0, 0])
755
- responses_items.append(make_double_click_item(
756
- x=coordinate[0] if len(coordinate) > 0 else 0,
757
- y=coordinate[1] if len(coordinate) > 1 else 0,
758
- call_id=call_id
759
- ))
785
+ responses_items.append(
786
+ make_double_click_item(
787
+ x=coordinate[0] if len(coordinate) > 0 else 0,
788
+ y=coordinate[1] if len(coordinate) > 1 else 0,
789
+ call_id=call_id,
790
+ )
791
+ )
760
792
  elif action_type == "triple_click":
761
793
  # coordinate = tool_input.get("coordinate", [0, 0])
762
794
  # responses_items.append({
@@ -782,11 +814,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
782
814
  # }
783
815
  # })
784
816
  coordinate = tool_input.get("coordinate", [None, None])
785
- responses_items.append(make_left_mouse_down_item(
786
- x=coordinate[0] if len(coordinate) > 0 else None,
787
- y=coordinate[1] if len(coordinate) > 1 else None,
788
- call_id=call_id
789
- ))
817
+ responses_items.append(
818
+ make_left_mouse_down_item(
819
+ x=coordinate[0] if len(coordinate) > 0 else None,
820
+ y=coordinate[1] if len(coordinate) > 1 else None,
821
+ call_id=call_id,
822
+ )
823
+ )
790
824
  elif action_type == "left_mouse_up":
791
825
  # coordinate = tool_input.get("coordinate", [0, 0])
792
826
  # responses_items.append({
@@ -800,11 +834,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
800
834
  # }
801
835
  # })
802
836
  coordinate = tool_input.get("coordinate", [None, None])
803
- responses_items.append(make_left_mouse_up_item(
804
- x=coordinate[0] if len(coordinate) > 0 else None,
805
- y=coordinate[1] if len(coordinate) > 1 else None,
806
- call_id=call_id
807
- ))
837
+ responses_items.append(
838
+ make_left_mouse_up_item(
839
+ x=coordinate[0] if len(coordinate) > 0 else None,
840
+ y=coordinate[1] if len(coordinate) > 1 else None,
841
+ call_id=call_id,
842
+ )
843
+ )
808
844
  elif action_type == "hold_key":
809
845
  # responses_items.append({
810
846
  # "type": "computer_call",
@@ -816,21 +852,21 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
816
852
  # })
817
853
  raise NotImplementedError("hold_key")
818
854
  elif action_type == "wait":
819
- responses_items.append(make_wait_item(
820
- call_id=call_id
821
- ))
855
+ responses_items.append(make_wait_item(call_id=call_id))
822
856
  else:
823
857
  raise ValueError(f"Unknown action type: {action_type}")
824
858
  except Exception as e:
825
- responses_items.extend(make_failed_tool_call_items(
826
- tool_name="computer",
827
- tool_kwargs=tool_input,
828
- error_message=repr(e),
829
- call_id=call_id
830
- ))
831
-
859
+ responses_items.extend(
860
+ make_failed_tool_call_items(
861
+ tool_name="computer",
862
+ tool_kwargs=tool_input,
863
+ error_message=repr(e),
864
+ call_id=call_id,
865
+ )
866
+ )
867
+
832
868
  # Handle tool calls (alternative format)
833
- if hasattr(message, 'tool_calls') and message.tool_calls:
869
+ if hasattr(message, "tool_calls") and message.tool_calls:
834
870
  for tool_call in message.tool_calls:
835
871
  if tool_call.function.name == "computer":
836
872
  try:
@@ -852,7 +888,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
852
888
  # "id": "call_1",
853
889
  # "type": "function"
854
890
  # }
855
-
891
+
856
892
  # Output:
857
893
  # {
858
894
  # "type": "computer_call",
@@ -861,9 +897,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
861
897
  # "type": "screenshot"
862
898
  # }
863
899
  # }
864
- responses_items.append(make_screenshot_item(
865
- call_id=call_id
866
- ))
900
+ responses_items.append(make_screenshot_item(call_id=call_id))
867
901
  elif action_type in ["click", "left_click"]:
868
902
  # Input:
869
903
  # {
@@ -877,7 +911,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
877
911
  # "id": "call_1",
878
912
  # "type": "function"
879
913
  # }
880
-
914
+
881
915
  # Output:
882
916
  # {
883
917
  # "type": "computer_call",
@@ -889,11 +923,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
889
923
  # }
890
924
  # }
891
925
  coordinate = args.get("coordinate", [0, 0])
892
- responses_items.append(make_click_item(
893
- x=coordinate[0] if len(coordinate) > 0 else 0,
894
- y=coordinate[1] if len(coordinate) > 1 else 0,
895
- call_id=call_id
896
- ))
926
+ responses_items.append(
927
+ make_click_item(
928
+ x=coordinate[0] if len(coordinate) > 0 else 0,
929
+ y=coordinate[1] if len(coordinate) > 1 else 0,
930
+ call_id=call_id,
931
+ )
932
+ )
897
933
  elif action_type in ["type", "type_text"]:
898
934
  # Input:
899
935
  # {
@@ -907,7 +943,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
907
943
  # "id": "call_1",
908
944
  # "type": "function"
909
945
  # }
910
-
946
+
911
947
  # Output:
912
948
  # {
913
949
  # "type": "computer_call",
@@ -917,10 +953,9 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
917
953
  # "text": "Hello World"
918
954
  # }
919
955
  # }
920
- responses_items.append(make_type_item(
921
- text=args.get("text", ""),
922
- call_id=call_id
923
- ))
956
+ responses_items.append(
957
+ make_type_item(text=args.get("text", ""), call_id=call_id)
958
+ )
924
959
  elif action_type in ["key", "keypress", "hotkey"]:
925
960
  # Input:
926
961
  # {
@@ -934,7 +969,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
934
969
  # "id": "call_1",
935
970
  # "type": "function"
936
971
  # }
937
-
972
+
938
973
  # Output:
939
974
  # {
940
975
  # "type": "computer_call",
@@ -944,10 +979,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
944
979
  # "keys": ["ctrl", "c"]
945
980
  # }
946
981
  # }
947
- responses_items.append(make_keypress_item(
948
- keys=args.get("text", "").replace("+", "-").split("-"),
949
- call_id=call_id
950
- ))
982
+ responses_items.append(
983
+ make_keypress_item(
984
+ keys=args.get("text", "").replace("+", "-").split("-"),
985
+ call_id=call_id,
986
+ )
987
+ )
951
988
  elif action_type in ["mouse_move", "move_cursor", "move"]:
952
989
  # Input:
953
990
  # {
@@ -961,7 +998,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
961
998
  # "id": "call_1",
962
999
  # "type": "function"
963
1000
  # }
964
-
1001
+
965
1002
  # Output:
966
1003
  # {
967
1004
  # "type": "computer_call",
@@ -973,12 +1010,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
973
1010
  # }
974
1011
  # }
975
1012
  coordinate = args.get("coordinate", [0, 0])
976
- responses_items.append(make_move_item(
977
- x=coordinate[0] if len(coordinate) > 0 else 0,
978
- y=coordinate[1] if len(coordinate) > 1 else 0,
979
- call_id=call_id
980
- ))
981
-
1013
+ responses_items.append(
1014
+ make_move_item(
1015
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1016
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1017
+ call_id=call_id,
1018
+ )
1019
+ )
1020
+
982
1021
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
983
1022
  elif action_type == "scroll":
984
1023
  # Input:
@@ -995,7 +1034,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
995
1034
  # "id": "call_1",
996
1035
  # "type": "function"
997
1036
  # }
998
-
1037
+
999
1038
  # Output:
1000
1039
  # {
1001
1040
  # "type": "computer_call",
@@ -1011,17 +1050,25 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1011
1050
  coordinate = args.get("coordinate", [0, 0])
1012
1051
  direction = args.get("scroll_direction", "down")
1013
1052
  amount = args.get("scroll_amount", 3)
1014
- scroll_x = amount if direction == "left" else \
1015
- -amount if direction == "right" else 0
1016
- scroll_y = amount if direction == "up" else \
1017
- -amount if direction == "down" else 0
1018
- responses_items.append(make_scroll_item(
1019
- x=coordinate[0] if len(coordinate) > 0 else 0,
1020
- y=coordinate[1] if len(coordinate) > 1 else 0,
1021
- scroll_x=scroll_x,
1022
- scroll_y=scroll_y,
1023
- call_id=call_id
1024
- ))
1053
+ scroll_x = (
1054
+ amount
1055
+ if direction == "left"
1056
+ else -amount if direction == "right" else 0
1057
+ )
1058
+ scroll_y = (
1059
+ amount
1060
+ if direction == "up"
1061
+ else -amount if direction == "down" else 0
1062
+ )
1063
+ responses_items.append(
1064
+ make_scroll_item(
1065
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1066
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1067
+ scroll_x=scroll_x,
1068
+ scroll_y=scroll_y,
1069
+ call_id=call_id,
1070
+ )
1071
+ )
1025
1072
  elif action_type in ["left_click_drag", "drag"]:
1026
1073
  # Input:
1027
1074
  # {
@@ -1036,7 +1083,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1036
1083
  # "id": "call_1",
1037
1084
  # "type": "function"
1038
1085
  # }
1039
-
1086
+
1040
1087
  # Output:
1041
1088
  # {
1042
1089
  # "type": "computer_call",
@@ -1051,19 +1098,21 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1051
1098
  # }
1052
1099
  start_coord = args.get("start_coordinate", [0, 0])
1053
1100
  end_coord = args.get("end_coordinate", [0, 0])
1054
- responses_items.append(make_drag_item(
1055
- path=[
1056
- {
1057
- "x": start_coord[0] if len(start_coord) > 0 else 0,
1058
- "y": start_coord[1] if len(start_coord) > 1 else 0
1059
- },
1060
- {
1061
- "x": end_coord[0] if len(end_coord) > 0 else 0,
1062
- "y": end_coord[1] if len(end_coord) > 1 else 0
1063
- }
1064
- ],
1065
- call_id=call_id
1066
- ))
1101
+ responses_items.append(
1102
+ make_drag_item(
1103
+ path=[
1104
+ {
1105
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
1106
+ "y": start_coord[1] if len(start_coord) > 1 else 0,
1107
+ },
1108
+ {
1109
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
1110
+ "y": end_coord[1] if len(end_coord) > 1 else 0,
1111
+ },
1112
+ ],
1113
+ call_id=call_id,
1114
+ )
1115
+ )
1067
1116
  elif action_type == "right_click":
1068
1117
  # Input:
1069
1118
  # {
@@ -1077,7 +1126,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1077
1126
  # "id": "call_1",
1078
1127
  # "type": "function"
1079
1128
  # }
1080
-
1129
+
1081
1130
  # Output:
1082
1131
  # {
1083
1132
  # "type": "computer_call",
@@ -1090,12 +1139,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1090
1139
  # }
1091
1140
  # }
1092
1141
  coordinate = args.get("coordinate", [0, 0])
1093
- responses_items.append(make_click_item(
1094
- x=coordinate[0] if len(coordinate) > 0 else 0,
1095
- y=coordinate[1] if len(coordinate) > 1 else 0,
1096
- button="right",
1097
- call_id=call_id
1098
- ))
1142
+ responses_items.append(
1143
+ make_click_item(
1144
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1145
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1146
+ button="right",
1147
+ call_id=call_id,
1148
+ )
1149
+ )
1099
1150
  elif action_type == "middle_click":
1100
1151
  # Input:
1101
1152
  # {
@@ -1109,7 +1160,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1109
1160
  # "id": "call_1",
1110
1161
  # "type": "function"
1111
1162
  # }
1112
-
1163
+
1113
1164
  # Output:
1114
1165
  # {
1115
1166
  # "type": "computer_call",
@@ -1122,12 +1173,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1122
1173
  # }
1123
1174
  # }
1124
1175
  coordinate = args.get("coordinate", [0, 0])
1125
- responses_items.append(make_click_item(
1126
- x=coordinate[0] if len(coordinate) > 0 else 0,
1127
- y=coordinate[1] if len(coordinate) > 1 else 0,
1128
- button="wheel",
1129
- call_id=call_id
1130
- ))
1176
+ responses_items.append(
1177
+ make_click_item(
1178
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1179
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1180
+ button="wheel",
1181
+ call_id=call_id,
1182
+ )
1183
+ )
1131
1184
  elif action_type == "double_click":
1132
1185
  # Input:
1133
1186
  # {
@@ -1141,7 +1194,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1141
1194
  # "id": "call_1",
1142
1195
  # "type": "function"
1143
1196
  # }
1144
-
1197
+
1145
1198
  # Output:
1146
1199
  # {
1147
1200
  # "type": "computer_call",
@@ -1153,11 +1206,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1153
1206
  # }
1154
1207
  # }
1155
1208
  coordinate = args.get("coordinate", [0, 0])
1156
- responses_items.append(make_double_click_item(
1157
- x=coordinate[0] if len(coordinate) > 0 else 0,
1158
- y=coordinate[1] if len(coordinate) > 1 else 0,
1159
- call_id=call_id
1160
- ))
1209
+ responses_items.append(
1210
+ make_double_click_item(
1211
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1212
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1213
+ call_id=call_id,
1214
+ )
1215
+ )
1161
1216
  elif action_type == "triple_click":
1162
1217
  # Input:
1163
1218
  # {
@@ -1171,7 +1226,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1171
1226
  # "id": "call_1",
1172
1227
  # "type": "function"
1173
1228
  # }
1174
-
1229
+
1175
1230
  # Output:
1176
1231
  # {
1177
1232
  # "type": "computer_call",
@@ -1196,7 +1251,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1196
1251
  # "id": "call_1",
1197
1252
  # "type": "function"
1198
1253
  # }
1199
-
1254
+
1200
1255
  # Output:
1201
1256
  # {
1202
1257
  # "type": "computer_call",
@@ -1209,11 +1264,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1209
1264
  # }
1210
1265
  # }
1211
1266
  coordinate = args.get("coordinate", [None, None])
1212
- responses_items.append(make_left_mouse_down_item(
1213
- x=coordinate[0] if len(coordinate) > 0 else None,
1214
- y=coordinate[1] if len(coordinate) > 1 else None,
1215
- call_id=call_id
1216
- ))
1267
+ responses_items.append(
1268
+ make_left_mouse_down_item(
1269
+ x=coordinate[0] if len(coordinate) > 0 else None,
1270
+ y=coordinate[1] if len(coordinate) > 1 else None,
1271
+ call_id=call_id,
1272
+ )
1273
+ )
1217
1274
  elif action_type == "left_mouse_up":
1218
1275
  # Input:
1219
1276
  # {
@@ -1227,7 +1284,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1227
1284
  # "id": "call_1",
1228
1285
  # "type": "function"
1229
1286
  # }
1230
-
1287
+
1231
1288
  # Output:
1232
1289
  # {
1233
1290
  # "type": "computer_call",
@@ -1240,11 +1297,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1240
1297
  # }
1241
1298
  # }
1242
1299
  coordinate = args.get("coordinate", [None, None])
1243
- responses_items.append(make_left_mouse_up_item(
1244
- x=coordinate[0] if len(coordinate) > 0 else None,
1245
- y=coordinate[1] if len(coordinate) > 1 else None,
1246
- call_id=call_id
1247
- ))
1300
+ responses_items.append(
1301
+ make_left_mouse_up_item(
1302
+ x=coordinate[0] if len(coordinate) > 0 else None,
1303
+ y=coordinate[1] if len(coordinate) > 1 else None,
1304
+ call_id=call_id,
1305
+ )
1306
+ )
1248
1307
  elif action_type == "hold_key":
1249
1308
  # Input:
1250
1309
  # {
@@ -1258,7 +1317,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1258
1317
  # "id": "call_1",
1259
1318
  # "type": "function"
1260
1319
  # }
1261
-
1320
+
1262
1321
  # Output:
1263
1322
  # {
1264
1323
  # "type": "computer_call",
@@ -1281,7 +1340,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1281
1340
  # "id": "call_1",
1282
1341
  # "type": "function"
1283
1342
  # }
1284
-
1343
+
1285
1344
  # Output:
1286
1345
  # {
1287
1346
  # "type": "computer_call",
@@ -1290,74 +1349,77 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1290
1349
  # "type": "wait"
1291
1350
  # }
1292
1351
  # }
1293
- responses_items.append(make_wait_item(
1294
- call_id=call_id
1295
- ))
1352
+ responses_items.append(make_wait_item(call_id=call_id))
1296
1353
  except Exception as e:
1297
- responses_items.extend(make_failed_tool_call_items(
1298
- tool_name="computer",
1299
- tool_kwargs=args,
1300
- error_message=repr(e),
1301
- call_id=call_id
1302
- ))
1354
+ responses_items.extend(
1355
+ make_failed_tool_call_items(
1356
+ tool_name="computer",
1357
+ tool_kwargs=args,
1358
+ error_message=repr(e),
1359
+ call_id=call_id,
1360
+ )
1361
+ )
1303
1362
  except json.JSONDecodeError:
1304
1363
  print("Failed to decode tool call arguments")
1305
1364
  # Skip malformed tool calls
1306
1365
  continue
1307
-
1366
+
1308
1367
  return responses_items
1309
1368
 
1369
+
1310
1370
  def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1311
1371
  """Add cache control to completion messages"""
1312
1372
  num_writes = 0
1313
1373
  for message in completion_messages:
1314
- message["cache_control"] = { "type": "ephemeral" }
1374
+ message["cache_control"] = {"type": "ephemeral"}
1315
1375
  num_writes += 1
1316
1376
  # Cache control has a maximum of 4 blocks
1317
1377
  if num_writes >= 4:
1318
1378
  break
1319
-
1379
+
1320
1380
  return completion_messages
1321
1381
 
1382
+
1322
1383
  def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1323
1384
  """Combine completion messages with the same role"""
1324
1385
  if not completion_messages:
1325
1386
  return completion_messages
1326
-
1387
+
1327
1388
  combined_messages = []
1328
-
1389
+
1329
1390
  for message in completion_messages:
1330
1391
  # If this is the first message or role is different from last, add as new message
1331
1392
  if not combined_messages or combined_messages[-1]["role"] != message["role"]:
1332
1393
  # Ensure content is a list format and normalize text content
1333
1394
  new_message = message.copy()
1334
1395
  new_message["content"] = _normalize_content(message.get("content", ""))
1335
-
1396
+
1336
1397
  # Copy tool_calls if present
1337
1398
  if "tool_calls" in message:
1338
1399
  new_message["tool_calls"] = message["tool_calls"].copy()
1339
-
1400
+
1340
1401
  combined_messages.append(new_message)
1341
1402
  else:
1342
1403
  # Same role as previous message, combine them
1343
1404
  last_message = combined_messages[-1]
1344
-
1405
+
1345
1406
  # Combine content
1346
1407
  current_content = _normalize_content(message.get("content", ""))
1347
1408
  last_message["content"].extend(current_content)
1348
-
1409
+
1349
1410
  # Combine tool_calls if present
1350
1411
  if "tool_calls" in message:
1351
1412
  if "tool_calls" not in last_message:
1352
1413
  last_message["tool_calls"] = []
1353
1414
  last_message["tool_calls"].extend(message["tool_calls"])
1354
-
1415
+
1355
1416
  # Post-process to merge consecutive text blocks
1356
1417
  for message in combined_messages:
1357
1418
  message["content"] = _merge_consecutive_text(message["content"])
1358
-
1419
+
1359
1420
  return combined_messages
1360
1421
 
1422
+
1361
1423
  def _normalize_content(content) -> List[Dict[str, Any]]:
1362
1424
  """Normalize content to list format"""
1363
1425
  if isinstance(content, str):
@@ -1370,28 +1432,28 @@ def _normalize_content(content) -> List[Dict[str, Any]]:
1370
1432
  else:
1371
1433
  return []
1372
1434
 
1435
+
1373
1436
  def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1374
1437
  """Merge consecutive text blocks with newlines"""
1375
1438
  if not content_list:
1376
1439
  return content_list
1377
-
1440
+
1378
1441
  merged = []
1379
-
1442
+
1380
1443
  for item in content_list:
1381
- if (item.get("type") == "text" and
1382
- merged and
1383
- merged[-1].get("type") == "text"):
1444
+ if item.get("type") == "text" and merged and merged[-1].get("type") == "text":
1384
1445
  # Merge with previous text block
1385
1446
  merged[-1]["text"] += "\n" + item["text"]
1386
1447
  else:
1387
1448
  merged.append(item.copy())
1388
-
1449
+
1389
1450
  return merged
1390
1451
 
1452
+
1391
1453
  @register_agent(models=r".*claude-.*")
1392
1454
  class AnthropicHostedToolsConfig(AsyncAgentConfig):
1393
1455
  """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
1394
-
1456
+
1395
1457
  async def predict_step(
1396
1458
  self,
1397
1459
  messages: Messages,
@@ -1405,21 +1467,21 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1405
1467
  _on_api_end=None,
1406
1468
  _on_usage=None,
1407
1469
  _on_screenshot=None,
1408
- **kwargs
1470
+ **kwargs,
1409
1471
  ) -> Dict[str, Any]:
1410
1472
  """
1411
1473
  Anthropic hosted tools agent loop using liteLLM acompletion.
1412
-
1474
+
1413
1475
  Supports Anthropic's computer use models with hosted tools.
1414
1476
  """
1415
1477
  tools = tools or []
1416
-
1478
+
1417
1479
  # Get tool configuration for this model
1418
1480
  tool_config = _get_tool_config_for_model(model)
1419
-
1481
+
1420
1482
  # Prepare tools for Anthropic API
1421
1483
  anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
1422
-
1484
+
1423
1485
  # Convert responses_items messages to completion format
1424
1486
  completion_messages = _convert_responses_items_to_completion_messages(messages)
1425
1487
  if use_prompt_caching:
@@ -1427,7 +1489,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1427
1489
  completion_messages = _combine_completion_messages(completion_messages)
1428
1490
  # Then add cache control, anthropic requires explicit "cache_control" dicts
1429
1491
  completion_messages = _add_cache_control(completion_messages)
1430
-
1492
+
1431
1493
  # Prepare API call kwargs
1432
1494
  api_kwargs = {
1433
1495
  "model": model,
@@ -1435,80 +1497,74 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1435
1497
  "tools": anthropic_tools if anthropic_tools else None,
1436
1498
  "stream": stream,
1437
1499
  "num_retries": max_retries,
1438
- **kwargs
1500
+ **kwargs,
1439
1501
  }
1440
-
1502
+
1441
1503
  # Add beta header for computer use
1442
1504
  if anthropic_tools:
1443
- api_kwargs["headers"] = {
1444
- "anthropic-beta": tool_config["beta_flag"]
1445
- }
1446
-
1505
+ api_kwargs["headers"] = {"anthropic-beta": tool_config["beta_flag"]}
1506
+
1447
1507
  # Call API start hook
1448
1508
  if _on_api_start:
1449
1509
  await _on_api_start(api_kwargs)
1450
-
1510
+
1451
1511
  # Use liteLLM acompletion
1452
1512
  response = await litellm.acompletion(**api_kwargs)
1453
-
1513
+
1454
1514
  # Call API end hook
1455
1515
  if _on_api_end:
1456
1516
  await _on_api_end(api_kwargs, response)
1457
-
1517
+
1458
1518
  # Convert response to responses_items format
1459
1519
  responses_items = _convert_completion_to_responses_items(response)
1460
1520
 
1461
1521
  # Extract usage information
1462
- responses_usage = {
1463
- **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
1522
+ responses_usage = {
1523
+ **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
1524
+ response.usage
1525
+ ).model_dump(),
1464
1526
  "response_cost": response._hidden_params.get("response_cost", 0.0),
1465
1527
  }
1466
1528
  if _on_usage:
1467
1529
  await _on_usage(responses_usage)
1468
1530
 
1469
1531
  # Return in AsyncAgentConfig format
1470
- return {
1471
- "output": responses_items,
1472
- "usage": responses_usage
1473
- }
1474
-
1532
+ return {"output": responses_items, "usage": responses_usage}
1533
+
1475
1534
  async def predict_click(
1476
- self,
1477
- model: str,
1478
- image_b64: str,
1479
- instruction: str,
1480
- **kwargs
1535
+ self, model: str, image_b64: str, instruction: str, **kwargs
1481
1536
  ) -> Optional[Tuple[int, int]]:
1482
1537
  """
1483
1538
  Predict click coordinates based on image and instruction.
1484
-
1539
+
1485
1540
  Uses Anthropic's computer use models with a custom prompt that instructs
1486
1541
  the agent to only output clicks.
1487
-
1542
+
1488
1543
  Args:
1489
1544
  model: Model name to use
1490
1545
  image_b64: Base64 encoded image
1491
1546
  instruction: Instruction for where to click
1492
-
1547
+
1493
1548
  Returns:
1494
1549
  Tuple of (x, y) coordinates or None if prediction fails
1495
1550
  """
1496
1551
  # Get image dimensions from base64 data
1497
1552
  try:
1498
1553
  import base64
1499
- from PIL import Image
1500
1554
  from io import BytesIO
1501
-
1555
+
1556
+ from PIL import Image
1557
+
1502
1558
  image_data = base64.b64decode(image_b64)
1503
1559
  image = Image.open(BytesIO(image_data))
1504
1560
  display_width, display_height = image.size
1505
1561
  except Exception:
1506
1562
  # Fallback to default dimensions if image parsing fails
1507
1563
  display_width, display_height = 1024, 768
1508
-
1564
+
1509
1565
  # Get tool configuration for this model
1510
1566
  tool_config = _get_tool_config_for_model(model)
1511
-
1567
+
1512
1568
  # Prepare computer tool for Anthropic format
1513
1569
  computer_tool = {
1514
1570
  "type": tool_config["tool_version"],
@@ -1521,7 +1577,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1521
1577
  },
1522
1578
  },
1523
1579
  }
1524
-
1580
+
1525
1581
  # Construct messages in OpenAI chat completion format for liteLLM
1526
1582
  messages = [
1527
1583
  {
@@ -1540,18 +1596,16 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1540
1596
  7. Be decisive and action-oriented. Complete the requested task fully.
1541
1597
 
1542
1598
  Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
1543
- Task: Click {instruction}. Output ONLY a click action on the target element."""
1599
+ Task: Click {instruction}. Output ONLY a click action on the target element.""",
1544
1600
  },
1545
1601
  {
1546
1602
  "type": "image_url",
1547
- "image_url": {
1548
- "url": f"data:image/png;base64,{image_b64}"
1549
- }
1550
- }
1551
- ]
1603
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
1604
+ },
1605
+ ],
1552
1606
  }
1553
1607
  ]
1554
-
1608
+
1555
1609
  # Prepare API call kwargs
1556
1610
  api_kwargs = {
1557
1611
  "model": model,
@@ -1559,31 +1613,31 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
1559
1613
  "tools": [computer_tool],
1560
1614
  "stream": False,
1561
1615
  "max_tokens": 100, # Keep response short for click prediction
1562
- "headers": {
1563
- "anthropic-beta": tool_config["beta_flag"]
1564
- }
1616
+ "headers": {"anthropic-beta": tool_config["beta_flag"]},
1565
1617
  }
1566
-
1618
+
1567
1619
  # Use liteLLM acompletion
1568
1620
  response = await litellm.acompletion(**api_kwargs)
1569
-
1621
+
1570
1622
  # Convert response to responses_items format to extract click coordinates
1571
1623
  responses_items = _convert_completion_to_responses_items(response)
1572
-
1624
+
1573
1625
  # Look for computer_call with click action
1574
1626
  for item in responses_items:
1575
- if (isinstance(item, dict) and
1576
- item.get("type") == "computer_call" and
1577
- isinstance(item.get("action"), dict)):
1578
-
1627
+ if (
1628
+ isinstance(item, dict)
1629
+ and item.get("type") == "computer_call"
1630
+ and isinstance(item.get("action"), dict)
1631
+ ):
1632
+
1579
1633
  action = item["action"]
1580
1634
  if action.get("x") and action.get("y"):
1581
1635
  x = action.get("x")
1582
1636
  y = action.get("y")
1583
1637
  return (int(x), int(y))
1584
-
1638
+
1585
1639
  return None
1586
-
1640
+
1587
1641
  def get_capabilities(self) -> List[AgentCapability]:
1588
1642
  """Return the capabilities supported by this agent."""
1589
1643
  return ["click", "step"]