cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/anthropic.py CHANGED
@@ -4,69 +4,68 @@ Anthropic hosted tools agent loop implementation using liteLLM
4
4
 
5
5
  import asyncio
6
6
  import json
7
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
7
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
8
+
8
9
  import litellm
9
- from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
10
+ from litellm.responses.litellm_completion_transformation.transformation import (
11
+ LiteLLMCompletionResponsesConfig,
12
+ )
10
13
 
11
14
  from ..decorators import register_agent
12
- from ..types import Messages, AgentResponse, Tools, AgentCapability
13
15
  from ..loops.base import AsyncAgentConfig
14
16
  from ..responses import (
15
- make_reasoning_item,
16
- make_output_text_item,
17
17
  make_click_item,
18
18
  make_double_click_item,
19
19
  make_drag_item,
20
+ make_failed_tool_call_items,
21
+ make_input_image_item,
20
22
  make_keypress_item,
23
+ make_left_mouse_down_item,
24
+ make_left_mouse_up_item,
21
25
  make_move_item,
26
+ make_output_text_item,
27
+ make_reasoning_item,
28
+ make_screenshot_item,
22
29
  make_scroll_item,
23
30
  make_type_item,
24
31
  make_wait_item,
25
- make_input_image_item,
26
- make_screenshot_item,
27
- make_failed_tool_call_items,
28
- make_left_mouse_down_item,
29
- make_left_mouse_up_item
30
32
  )
33
+ from ..types import AgentCapability, AgentResponse, Messages, Tools
31
34
 
32
35
  # Model version mapping to tool version and beta flag
33
36
  MODEL_TOOL_MAPPING = [
34
37
  # Claude 4 models
35
38
  {
36
- "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
39
+ "pattern": r"claude-4|claude-opus-4|claude-sonnet-4|claude-haiku-4",
37
40
  "tool_version": "computer_20250124",
38
- "beta_flag": "computer-use-2025-01-24"
41
+ "beta_flag": "computer-use-2025-01-24",
39
42
  },
40
43
  # Claude 3.7 models
41
44
  {
42
45
  "pattern": r"claude-3\.?7|claude-3-7",
43
46
  "tool_version": "computer_20250124",
44
- "beta_flag": "computer-use-2025-01-24"
47
+ "beta_flag": "computer-use-2025-01-24",
45
48
  },
46
49
  # Claude 3.5 models (fallback)
47
50
  {
48
51
  "pattern": r"claude-3\.?5|claude-3-5",
49
52
  "tool_version": "computer_20241022",
50
- "beta_flag": "computer-use-2024-10-22"
51
- }
53
+ "beta_flag": "computer-use-2024-10-22",
54
+ },
52
55
  ]
53
56
 
57
+
54
58
  def _get_tool_config_for_model(model: str) -> Dict[str, str]:
55
59
  """Get tool version and beta flag for the given model."""
56
60
  import re
57
-
61
+
58
62
  for mapping in MODEL_TOOL_MAPPING:
59
63
  if re.search(mapping["pattern"], model, re.IGNORECASE):
60
- return {
61
- "tool_version": mapping["tool_version"],
62
- "beta_flag": mapping["beta_flag"]
63
- }
64
-
64
+ return {"tool_version": mapping["tool_version"], "beta_flag": mapping["beta_flag"]}
65
+
65
66
  # Default to Claude 3.5 configuration
66
- return {
67
- "tool_version": "computer_20241022",
68
- "beta_flag": "computer-use-2024-10-22"
69
- }
67
+ return {"tool_version": "computer_20241022", "beta_flag": "computer-use-2024-10-22"}
68
+
70
69
 
71
70
  async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
72
71
  """Map a computer tool to Anthropic's hosted tool schema."""
@@ -76,7 +75,7 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
76
75
  except Exception:
77
76
  # Fallback to default dimensions if method fails
78
77
  width, height = 1024, 768
79
-
78
+
80
79
  return {
81
80
  "type": tool_version,
82
81
  "function": {
@@ -89,32 +88,34 @@ async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str)
89
88
  },
90
89
  }
91
90
 
91
+
92
92
  async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
93
93
  """Prepare tools for Anthropic API format."""
94
94
  tool_config = _get_tool_config_for_model(model)
95
95
  anthropic_tools = []
96
-
96
+
97
97
  for schema in tool_schemas:
98
98
  if schema["type"] == "computer":
99
99
  # Map computer tool to Anthropic format
100
- anthropic_tools.append(await _map_computer_tool_to_anthropic(
101
- schema["computer"],
102
- tool_config["tool_version"]
103
- ))
100
+ anthropic_tools.append(
101
+ await _map_computer_tool_to_anthropic(
102
+ schema["computer"], tool_config["tool_version"]
103
+ )
104
+ )
104
105
  elif schema["type"] == "function":
105
106
  # Function tools - convert to Anthropic format
106
107
  function_schema = schema["function"]
107
- anthropic_tools.append({
108
- "type": "function",
109
- "function": {
108
+ anthropic_tools.append(
109
+ {
110
110
  "name": function_schema["name"],
111
111
  "description": function_schema.get("description", ""),
112
- "parameters": function_schema.get("parameters", {})
112
+ "input_schema": function_schema.get("parameters", {}),
113
113
  }
114
- })
115
-
114
+ )
115
+
116
116
  return anthropic_tools
117
117
 
118
+
118
119
  def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
119
120
  """Convert responses_items message format to liteLLM completion format."""
120
121
  completion_messages = []
@@ -123,7 +124,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
123
124
  for message in messages:
124
125
  msg_type = message.get("type")
125
126
  role = message.get("role")
126
-
127
+
127
128
  # Handle user messages (both with and without explicit type)
128
129
  if role == "user" or msg_type == "user":
129
130
  content = message.get("content", "")
@@ -132,55 +133,41 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
132
133
  converted_content = []
133
134
  for item in content:
134
135
  if isinstance(item, dict) and item.get("type") == "input_image":
135
- # Convert input_image to Anthropic image format
136
+ # Convert input_image to OpenAI image format
136
137
  image_url = item.get("image_url", "")
137
138
  if image_url and image_url != "[omitted]":
138
- # Extract base64 data from data URL
139
- if "," in image_url:
140
- base64_data = image_url.split(",")[-1]
141
- else:
142
- base64_data = image_url
143
-
144
- converted_content.append({
145
- "type": "image",
146
- "source": {
147
- "type": "base64",
148
- "media_type": "image/png",
149
- "data": base64_data
150
- }
151
- })
139
+ converted_content.append(
140
+ {"type": "image_url", "image_url": {"url": image_url}}
141
+ )
142
+ elif isinstance(item, dict) and item.get("type") == "input_text":
143
+ # Convert input_text to OpenAI text format
144
+ text = item.get("text", "")
145
+ converted_content.append({"type": "text", "text": text})
152
146
  else:
153
147
  # Keep other content types as-is
154
148
  converted_content.append(item)
155
-
156
- completion_messages.append({
157
- "role": "user",
158
- "content": converted_content if converted_content else content
159
- })
149
+
150
+ completion_messages.append(
151
+ {"role": "user", "content": converted_content if converted_content else content}
152
+ )
160
153
  else:
161
154
  # Text content
162
- completion_messages.append({
163
- "role": "user",
164
- "content": content
165
- })
166
-
155
+ completion_messages.append({"role": "user", "content": content})
156
+
167
157
  # Handle assistant messages
168
158
  elif role == "assistant":
169
159
  content = message.get("content", [])
170
160
  if isinstance(content, str):
171
- content = [{ "type": "output_text", "text": content }]
172
-
161
+ content = [{"type": "output_text", "text": content}]
162
+
173
163
  content = "\n".join(item.get("text", "") for item in content)
174
- completion_messages.append({
175
- "role": "assistant",
176
- "content": content
177
- })
178
-
164
+ completion_messages.append({"role": "assistant", "content": content})
165
+
179
166
  elif msg_type == "reasoning":
180
167
  # Reasoning becomes part of assistant message
181
168
  summary = message.get("summary", [])
182
169
  reasoning_text = ""
183
-
170
+
184
171
  if isinstance(summary, list) and summary:
185
172
  # Extract text from summary items
186
173
  for item in summary:
@@ -190,58 +177,54 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
190
177
  else:
191
178
  # Fallback to direct reasoning field
192
179
  reasoning_text = message.get("reasoning", "")
193
-
180
+
194
181
  if reasoning_text:
195
- completion_messages.append({
196
- "role": "assistant",
197
- "content": reasoning_text
198
- })
199
-
182
+ completion_messages.append({"role": "assistant", "content": reasoning_text})
183
+
200
184
  elif msg_type == "function_call":
201
185
  fn_name = message.get("name")
202
186
  fn_args = message.get("arguments", "{}")
203
187
  call_id = message.get("call_id", "call_1")
204
188
  call_id_to_fn_name[call_id] = fn_name
205
- openai_tool_calls = [{
206
- "id": call_id,
207
- "type": "function",
208
- "function": {
209
- "name": fn_name,
210
- "arguments": fn_args
189
+ openai_tool_calls = [
190
+ {
191
+ "id": call_id,
192
+ "type": "function",
193
+ "function": {"name": fn_name, "arguments": fn_args},
211
194
  }
212
- }] # If the last completion message is an assistant message, extend the tool_calls
195
+ ] # If the last completion message is an assistant message, extend the tool_calls
213
196
  if completion_messages and completion_messages[-1].get("role") == "assistant":
214
197
  if "tool_calls" not in completion_messages[-1]:
215
198
  completion_messages[-1]["tool_calls"] = []
216
199
  completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
217
200
  else:
218
201
  # Create new assistant message with tool calls
219
- completion_messages.append({
220
- "role": "assistant",
221
- "content": None,
222
- "tool_calls": openai_tool_calls
223
- })
224
-
202
+ completion_messages.append(
203
+ {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
204
+ )
205
+
225
206
  elif msg_type == "function_call_output":
226
207
  call_id = message.get("call_id", "call_1")
227
208
  fn_output = message.get("output", "")
228
209
  fn_name = call_id_to_fn_name.get(call_id, "computer")
229
210
 
230
- completion_messages.append({
231
- "role": "function",
232
- "name": fn_name,
233
- "tool_call_id": call_id,
234
- "content": str(fn_output)
235
- })
236
-
211
+ completion_messages.append(
212
+ {
213
+ "role": "function",
214
+ "name": fn_name,
215
+ "tool_call_id": call_id,
216
+ "content": str(fn_output),
217
+ }
218
+ )
219
+
237
220
  elif msg_type == "computer_call":
238
221
  # Computer call becomes tool use in assistant message
239
222
  action = message.get("action", {})
240
223
  action_type = action.get("type")
241
224
  call_id = message.get("call_id", "call_1")
242
-
225
+
243
226
  tool_use_content = []
244
-
227
+
245
228
  # Basic actions (all versions)
246
229
  if action_type == "click":
247
230
  # Input:
@@ -254,7 +237,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
254
237
  # "y": 200
255
238
  # }
256
239
  # }
257
-
240
+
258
241
  # Output:
259
242
  # {
260
243
  # "function": {
@@ -268,16 +251,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
268
251
  # "type": "function"
269
252
  # }
270
253
  button = action.get("button", "left")
271
- action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click"
272
- tool_use_content.append({
273
- "type": "tool_use",
274
- "id": call_id,
275
- "name": "computer",
276
- "input": {
277
- "action": action_name,
278
- "coordinate": [action.get("x", 0), action.get("y", 0)]
254
+ action_name = (
255
+ "right_click"
256
+ if button == "right"
257
+ else "middle_click" if button == "wheel" else "left_click"
258
+ )
259
+ tool_use_content.append(
260
+ {
261
+ "type": "tool_use",
262
+ "id": call_id,
263
+ "name": "computer",
264
+ "input": {
265
+ "action": action_name,
266
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
267
+ },
279
268
  }
280
- })
269
+ )
281
270
  elif action_type == "double_click":
282
271
  # Input:
283
272
  # {
@@ -289,7 +278,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
289
278
  # "y": 240
290
279
  # }
291
280
  # }
292
-
281
+
293
282
  # Output:
294
283
  # {
295
284
  # "function": {
@@ -302,15 +291,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
302
291
  # "id": "call_1",
303
292
  # "type": "function"
304
293
  # }
305
- tool_use_content.append({
306
- "type": "tool_use",
307
- "id": call_id,
308
- "name": "computer",
309
- "input": {
310
- "action": "double_click",
311
- "coordinate": [action.get("x", 0), action.get("y", 0)]
294
+ tool_use_content.append(
295
+ {
296
+ "type": "tool_use",
297
+ "id": call_id,
298
+ "name": "computer",
299
+ "input": {
300
+ "action": "double_click",
301
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
302
+ },
312
303
  }
313
- })
304
+ )
314
305
  elif action_type == "type":
315
306
  # Input:
316
307
  # {
@@ -321,7 +312,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
321
312
  # "text": "Hello World"
322
313
  # }
323
314
  # }
324
-
315
+
325
316
  # Output:
326
317
  # {
327
318
  # "function": {
@@ -334,15 +325,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
334
325
  # "id": "call_1",
335
326
  # "type": "function"
336
327
  # }
337
- tool_use_content.append({
338
- "type": "tool_use",
339
- "id": call_id,
340
- "name": "computer",
341
- "input": {
342
- "action": "type",
343
- "text": action.get("text", "")
328
+ tool_use_content.append(
329
+ {
330
+ "type": "tool_use",
331
+ "id": call_id,
332
+ "name": "computer",
333
+ "input": {"action": "type", "text": action.get("text", "")},
344
334
  }
345
- })
335
+ )
346
336
  elif action_type == "keypress":
347
337
  # Input:
348
338
  # {
@@ -353,7 +343,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
353
343
  # "keys": ["ctrl", "c"]
354
344
  # }
355
345
  # }
356
-
346
+
357
347
  # Output:
358
348
  # {
359
349
  # "function": {
@@ -366,15 +356,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
366
356
  # "id": "call_1",
367
357
  # "type": "function"
368
358
  # }
369
- tool_use_content.append({
370
- "type": "tool_use",
371
- "id": call_id,
372
- "name": "computer",
373
- "input": {
374
- "action": "key",
375
- "text": "+".join(action.get("keys", []))
359
+ tool_use_content.append(
360
+ {
361
+ "type": "tool_use",
362
+ "id": call_id,
363
+ "name": "computer",
364
+ "input": {"action": "key", "text": "+".join(action.get("keys", []))},
376
365
  }
377
- })
366
+ )
378
367
  elif action_type in ["mouse_move", "move"]:
379
368
  # Input:
380
369
  # {
@@ -386,7 +375,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
386
375
  # "y": 250
387
376
  # }
388
377
  # }
389
-
378
+
390
379
  # Output:
391
380
  # {
392
381
  # "function": {
@@ -399,15 +388,17 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
399
388
  # "id": "call_1",
400
389
  # "type": "function"
401
390
  # }
402
- tool_use_content.append({
403
- "type": "tool_use",
404
- "id": call_id,
405
- "name": "computer",
406
- "input": {
407
- "action": "mouse_move",
408
- "coordinate": [action.get("x", 0), action.get("y", 0)]
391
+ tool_use_content.append(
392
+ {
393
+ "type": "tool_use",
394
+ "id": call_id,
395
+ "name": "computer",
396
+ "input": {
397
+ "action": "mouse_move",
398
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
399
+ },
409
400
  }
410
- })
401
+ )
411
402
  elif action_type == "scroll":
412
403
  # Input:
413
404
  # {
@@ -421,7 +412,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
421
412
  # "scroll_y": -5
422
413
  # }
423
414
  # }
424
-
415
+
425
416
  # Output:
426
417
  # {
427
418
  # "function": {
@@ -454,18 +445,20 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
454
445
  else:
455
446
  direction = "down"
456
447
  amount = 3
457
-
458
- tool_use_content.append({
459
- "type": "tool_use",
460
- "id": call_id,
461
- "name": "computer",
462
- "input": {
463
- "action": "scroll",
464
- "coordinate": [action.get("x", 0), action.get("y", 0)],
465
- "scroll_direction": direction,
466
- "scroll_amount": amount
448
+
449
+ tool_use_content.append(
450
+ {
451
+ "type": "tool_use",
452
+ "id": call_id,
453
+ "name": "computer",
454
+ "input": {
455
+ "action": "scroll",
456
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
457
+ "scroll_direction": direction,
458
+ "scroll_amount": amount,
459
+ },
467
460
  }
468
- })
461
+ )
469
462
  elif action_type == "drag":
470
463
  # Input:
471
464
  # {
@@ -479,7 +472,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
479
472
  # ]
480
473
  # }
481
474
  # }
482
-
475
+
483
476
  # Output:
484
477
  # {
485
478
  # "function": {
@@ -499,17 +492,19 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
499
492
  if isinstance(path, list) and len(path) >= 2:
500
493
  start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
501
494
  end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
502
-
503
- tool_use_content.append({
504
- "type": "tool_use",
505
- "id": call_id,
506
- "name": "computer",
507
- "input": {
508
- "action": "left_click_drag",
509
- "start_coordinate": start_coord,
510
- "end_coordinate": end_coord
495
+
496
+ tool_use_content.append(
497
+ {
498
+ "type": "tool_use",
499
+ "id": call_id,
500
+ "name": "computer",
501
+ "input": {
502
+ "action": "left_click_drag",
503
+ "start_coordinate": start_coord,
504
+ "end_coordinate": end_coord,
505
+ },
511
506
  }
512
- })
507
+ )
513
508
  elif action_type == "wait":
514
509
  # Input:
515
510
  # {
@@ -519,7 +514,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
519
514
  # "type": "wait"
520
515
  # }
521
516
  # }
522
-
517
+
523
518
  # Output:
524
519
  # {
525
520
  # "function": {
@@ -531,14 +526,14 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
531
526
  # "id": "call_1",
532
527
  # "type": "function"
533
528
  # }
534
- tool_use_content.append({
535
- "type": "tool_use",
536
- "id": call_id,
537
- "name": "computer",
538
- "input": {
539
- "action": "wait"
529
+ tool_use_content.append(
530
+ {
531
+ "type": "tool_use",
532
+ "id": call_id,
533
+ "name": "computer",
534
+ "input": {"action": "wait"},
540
535
  }
541
- })
536
+ )
542
537
  elif action_type == "screenshot":
543
538
  # Input:
544
539
  # {
@@ -548,7 +543,7 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
548
543
  # "type": "screenshot"
549
544
  # }
550
545
  # }
551
-
546
+
552
547
  # Output:
553
548
  # {
554
549
  # "function": {
@@ -560,47 +555,53 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
560
555
  # "id": "call_1",
561
556
  # "type": "function"
562
557
  # }
563
- tool_use_content.append({
564
- "type": "tool_use",
565
- "id": call_id,
566
- "name": "computer",
567
- "input": {
568
- "action": "screenshot"
558
+ tool_use_content.append(
559
+ {
560
+ "type": "tool_use",
561
+ "id": call_id,
562
+ "name": "computer",
563
+ "input": {"action": "screenshot"},
569
564
  }
570
- })
565
+ )
571
566
  elif action_type == "left_mouse_down":
572
- tool_use_content.append({
573
- "type": "tool_use",
574
- "id": call_id,
575
- "name": "computer",
576
- "input": {
577
- "action": "left_mouse_down",
578
- "coordinate": [action.get("x", None), action.get("y", None)]
567
+ tool_use_content.append(
568
+ {
569
+ "type": "tool_use",
570
+ "id": call_id,
571
+ "name": "computer",
572
+ "input": {
573
+ "action": "left_mouse_down",
574
+ "coordinate": [action.get("x", None), action.get("y", None)],
575
+ },
579
576
  }
580
- })
577
+ )
581
578
  elif action_type == "left_mouse_up":
582
- tool_use_content.append({
583
- "type": "tool_use",
584
- "id": call_id,
585
- "name": "computer",
586
- "input": {
587
- "action": "left_mouse_up",
588
- "coordinate": [action.get("x", None), action.get("y", None)]
579
+ tool_use_content.append(
580
+ {
581
+ "type": "tool_use",
582
+ "id": call_id,
583
+ "name": "computer",
584
+ "input": {
585
+ "action": "left_mouse_up",
586
+ "coordinate": [action.get("x", None), action.get("y", None)],
587
+ },
589
588
  }
590
- })
591
-
589
+ )
590
+
592
591
  # Convert tool_use_content to OpenAI tool_calls format
593
592
  openai_tool_calls = []
594
593
  for tool_use in tool_use_content:
595
- openai_tool_calls.append({
596
- "id": tool_use["id"],
597
- "type": "function",
598
- "function": {
599
- "name": tool_use["name"],
600
- "arguments": json.dumps(tool_use["input"])
594
+ openai_tool_calls.append(
595
+ {
596
+ "id": tool_use["id"],
597
+ "type": "function",
598
+ "function": {
599
+ "name": tool_use["name"],
600
+ "arguments": json.dumps(tool_use["input"]),
601
+ },
601
602
  }
602
- })
603
-
603
+ )
604
+
604
605
  # If the last completion message is an assistant message, extend the tool_calls
605
606
  if completion_messages and completion_messages[-1].get("role") == "assistant":
606
607
  if "tool_calls" not in completion_messages[-1]:
@@ -608,54 +609,52 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
608
609
  completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
609
610
  else:
610
611
  # Create new assistant message with tool calls
611
- completion_messages.append({
612
- "role": "assistant",
613
- "content": None,
614
- "tool_calls": openai_tool_calls
615
- })
616
-
612
+ completion_messages.append(
613
+ {"role": "assistant", "content": None, "tool_calls": openai_tool_calls}
614
+ )
615
+
617
616
  elif msg_type == "computer_call_output":
618
617
  # Computer call output becomes OpenAI function result
619
618
  output = message.get("output", {})
620
619
  call_id = message.get("call_id", "call_1")
621
-
620
+
622
621
  if output.get("type") == "input_image":
623
622
  # Screenshot result - convert to OpenAI format with image_url content
624
623
  image_url = output.get("image_url", "")
625
- completion_messages.append({
626
- "role": "function",
627
- "name": "computer",
628
- "tool_call_id": call_id,
629
- "content": [{
630
- "type": "image_url",
631
- "image_url": {
632
- "url": image_url
633
- }
634
- }]
635
- })
624
+ completion_messages.append(
625
+ {
626
+ "role": "function",
627
+ "name": "computer",
628
+ "tool_call_id": call_id,
629
+ "content": [{"type": "image_url", "image_url": {"url": image_url}}],
630
+ }
631
+ )
636
632
  else:
637
633
  # Text result - convert to OpenAI format
638
- completion_messages.append({
639
- "role": "function",
640
- "name": "computer",
641
- "tool_call_id": call_id,
642
- "content": str(output)
643
- })
644
-
634
+ completion_messages.append(
635
+ {
636
+ "role": "function",
637
+ "name": "computer",
638
+ "tool_call_id": call_id,
639
+ "content": str(output),
640
+ }
641
+ )
642
+
645
643
  return completion_messages
646
644
 
645
+
647
646
  def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
648
647
  """Convert liteLLM completion response to responses_items message format."""
649
648
  responses_items = []
650
-
651
- if not response or not hasattr(response, 'choices') or not response.choices:
649
+
650
+ if not response or not hasattr(response, "choices") or not response.choices:
652
651
  return responses_items
653
-
652
+
654
653
  choice = response.choices[0]
655
654
  message = choice.message
656
-
655
+
657
656
  # Handle text content
658
- if hasattr(message, 'content') and message.content:
657
+ if hasattr(message, "content") and message.content:
659
658
  if isinstance(message.content, str):
660
659
  responses_items.append(make_output_text_item(message.content))
661
660
  elif isinstance(message.content, list):
@@ -664,35 +663,54 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
664
663
  if content_item.get("type") == "text":
665
664
  responses_items.append(make_output_text_item(content_item.get("text", "")))
666
665
  elif content_item.get("type") == "tool_use":
667
- # Convert tool use to computer call
666
+ # Check if this is a custom function tool or computer tool
667
+ tool_name = content_item.get("name", "computer")
668
668
  tool_input = content_item.get("input", {})
669
- action_type = tool_input.get("action")
670
669
  call_id = content_item.get("id")
671
-
670
+
671
+ # Handle custom function tools (not computer tools)
672
+ if tool_name != "computer":
673
+ from ..responses import make_function_call_item
674
+
675
+ responses_items.append(
676
+ make_function_call_item(
677
+ function_name=tool_name, arguments=tool_input, call_id=call_id
678
+ )
679
+ )
680
+ continue
681
+
682
+ # Computer tool - process actions
683
+ action_type = tool_input.get("action")
684
+
672
685
  # Action reference:
673
686
  # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions
674
-
687
+
675
688
  try:
676
689
  # Basic actions (all versions)
677
690
  if action_type == "screenshot":
678
691
  responses_items.append(make_screenshot_item(call_id=call_id))
679
692
  elif action_type in ["click", "left_click"]:
680
693
  coordinate = tool_input.get("coordinate", [0, 0])
681
- responses_items.append(make_click_item(
682
- x=coordinate[0] if len(coordinate) > 0 else 0,
683
- y=coordinate[1] if len(coordinate) > 1 else 0,
684
- call_id=call_id
685
- ))
694
+ responses_items.append(
695
+ make_click_item(
696
+ x=coordinate[0] if len(coordinate) > 0 else 0,
697
+ y=coordinate[1] if len(coordinate) > 1 else 0,
698
+ call_id=call_id,
699
+ )
700
+ )
686
701
  elif action_type in ["type", "type_text"]:
687
- responses_items.append(make_type_item(
688
- text=tool_input.get("text", ""),
689
- call_id=call_id
690
- ))
702
+ responses_items.append(
703
+ make_type_item(text=tool_input.get("text", ""), call_id=call_id)
704
+ )
691
705
  elif action_type in ["key", "keypress", "hotkey"]:
692
- responses_items.append(make_keypress_item(
693
- keys=tool_input.get("text", "").replace("+", "-").split("-"),
694
- call_id=call_id
695
- ))
706
+ responses_items.append(
707
+ make_keypress_item(
708
+ keys=tool_input.get("text", "")
709
+ .replace("+", "-")
710
+ .split("-"),
711
+ call_id=call_id,
712
+ )
713
+ )
696
714
  elif action_type in ["mouse_move", "move_cursor", "move"]:
697
715
  # Mouse move - create a custom action item
698
716
  coordinate = tool_input.get("coordinate", [0, 0])
@@ -700,64 +718,88 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
700
718
  make_move_item(
701
719
  x=coordinate[0] if len(coordinate) > 0 else 0,
702
720
  y=coordinate[1] if len(coordinate) > 1 else 0,
703
- call_id=call_id
721
+ call_id=call_id,
704
722
  )
705
723
  )
706
-
724
+
707
725
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
708
726
  elif action_type == "scroll":
709
727
  coordinate = tool_input.get("coordinate", [0, 0])
710
728
  scroll_amount = tool_input.get("scroll_amount", 3)
711
- scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
712
- -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
713
- scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
714
- -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
715
- responses_items.append(make_scroll_item(
716
- x=coordinate[0] if len(coordinate) > 0 else 0,
717
- y=coordinate[1] if len(coordinate) > 1 else 0,
718
- scroll_x=scroll_x,
719
- scroll_y=scroll_y,
720
- call_id=call_id
721
- ))
729
+ scroll_x = (
730
+ scroll_amount
731
+ if tool_input.get("scroll_direction", "down") == "right"
732
+ else (
733
+ -scroll_amount
734
+ if tool_input.get("scroll_direction", "down") == "left"
735
+ else 0
736
+ )
737
+ )
738
+ scroll_y = (
739
+ scroll_amount
740
+ if tool_input.get("scroll_direction", "down") == "down"
741
+ else (
742
+ -scroll_amount
743
+ if tool_input.get("scroll_direction", "down") == "up"
744
+ else 0
745
+ )
746
+ )
747
+ responses_items.append(
748
+ make_scroll_item(
749
+ x=coordinate[0] if len(coordinate) > 0 else 0,
750
+ y=coordinate[1] if len(coordinate) > 1 else 0,
751
+ scroll_x=scroll_x,
752
+ scroll_y=scroll_y,
753
+ call_id=call_id,
754
+ )
755
+ )
722
756
  elif action_type in ["left_click_drag", "drag"]:
723
757
  start_coord = tool_input.get("start_coordinate", [0, 0])
724
758
  end_coord = tool_input.get("end_coordinate", [0, 0])
725
- responses_items.append(make_drag_item(
726
- path=[
727
- {
728
- "x": start_coord[0] if len(start_coord) > 0 else 0,
729
- "y": start_coord[1] if len(start_coord) > 1 else 0
730
- },
731
- {
732
- "x": end_coord[0] if len(end_coord) > 0 else 0,
733
- "y": end_coord[1] if len(end_coord) > 1 else 0
734
- }
735
- ],
736
- call_id=call_id
737
- ))
759
+ responses_items.append(
760
+ make_drag_item(
761
+ path=[
762
+ {
763
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
764
+ "y": start_coord[1] if len(start_coord) > 1 else 0,
765
+ },
766
+ {
767
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
768
+ "y": end_coord[1] if len(end_coord) > 1 else 0,
769
+ },
770
+ ],
771
+ call_id=call_id,
772
+ )
773
+ )
738
774
  elif action_type == "right_click":
739
775
  coordinate = tool_input.get("coordinate", [0, 0])
740
- responses_items.append(make_click_item(
741
- x=coordinate[0] if len(coordinate) > 0 else 0,
742
- y=coordinate[1] if len(coordinate) > 1 else 0,
743
- button="right",
744
- call_id=call_id
745
- ))
776
+ responses_items.append(
777
+ make_click_item(
778
+ x=coordinate[0] if len(coordinate) > 0 else 0,
779
+ y=coordinate[1] if len(coordinate) > 1 else 0,
780
+ button="right",
781
+ call_id=call_id,
782
+ )
783
+ )
746
784
  elif action_type == "middle_click":
747
785
  coordinate = tool_input.get("coordinate", [0, 0])
748
- responses_items.append(make_click_item(
749
- x=coordinate[0] if len(coordinate) > 0 else 0,
750
- y=coordinate[1] if len(coordinate) > 1 else 0,
751
- button="wheel",
752
- call_id=call_id
753
- ))
786
+ responses_items.append(
787
+ make_click_item(
788
+ x=coordinate[0] if len(coordinate) > 0 else 0,
789
+ y=coordinate[1] if len(coordinate) > 1 else 0,
790
+ button="wheel",
791
+ call_id=call_id,
792
+ )
793
+ )
754
794
  elif action_type == "double_click":
755
795
  coordinate = tool_input.get("coordinate", [0, 0])
756
- responses_items.append(make_double_click_item(
757
- x=coordinate[0] if len(coordinate) > 0 else 0,
758
- y=coordinate[1] if len(coordinate) > 1 else 0,
759
- call_id=call_id
760
- ))
796
+ responses_items.append(
797
+ make_double_click_item(
798
+ x=coordinate[0] if len(coordinate) > 0 else 0,
799
+ y=coordinate[1] if len(coordinate) > 1 else 0,
800
+ call_id=call_id,
801
+ )
802
+ )
761
803
  elif action_type == "triple_click":
762
804
  # coordinate = tool_input.get("coordinate", [0, 0])
763
805
  # responses_items.append({
@@ -783,11 +825,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
783
825
  # }
784
826
  # })
785
827
  coordinate = tool_input.get("coordinate", [None, None])
786
- responses_items.append(make_left_mouse_down_item(
787
- x=coordinate[0] if len(coordinate) > 0 else None,
788
- y=coordinate[1] if len(coordinate) > 1 else None,
789
- call_id=call_id
790
- ))
828
+ responses_items.append(
829
+ make_left_mouse_down_item(
830
+ x=coordinate[0] if len(coordinate) > 0 else None,
831
+ y=coordinate[1] if len(coordinate) > 1 else None,
832
+ call_id=call_id,
833
+ )
834
+ )
791
835
  elif action_type == "left_mouse_up":
792
836
  # coordinate = tool_input.get("coordinate", [0, 0])
793
837
  # responses_items.append({
@@ -801,11 +845,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
801
845
  # }
802
846
  # })
803
847
  coordinate = tool_input.get("coordinate", [None, None])
804
- responses_items.append(make_left_mouse_up_item(
805
- x=coordinate[0] if len(coordinate) > 0 else None,
806
- y=coordinate[1] if len(coordinate) > 1 else None,
807
- call_id=call_id
808
- ))
848
+ responses_items.append(
849
+ make_left_mouse_up_item(
850
+ x=coordinate[0] if len(coordinate) > 0 else None,
851
+ y=coordinate[1] if len(coordinate) > 1 else None,
852
+ call_id=call_id,
853
+ )
854
+ )
809
855
  elif action_type == "hold_key":
810
856
  # responses_items.append({
811
857
  # "type": "computer_call",
@@ -817,22 +863,41 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
817
863
  # })
818
864
  raise NotImplementedError("hold_key")
819
865
  elif action_type == "wait":
820
- responses_items.append(make_wait_item(
821
- call_id=call_id
822
- ))
866
+ responses_items.append(make_wait_item(call_id=call_id))
823
867
  else:
824
868
  raise ValueError(f"Unknown action type: {action_type}")
825
869
  except Exception as e:
826
- responses_items.extend(make_failed_tool_call_items(
827
- tool_name="computer",
828
- tool_kwargs=tool_input,
829
- error_message=repr(e),
830
- call_id=call_id
831
- ))
832
-
870
+ responses_items.extend(
871
+ make_failed_tool_call_items(
872
+ tool_name="computer",
873
+ tool_kwargs=tool_input,
874
+ error_message=repr(e),
875
+ call_id=call_id,
876
+ )
877
+ )
878
+
833
879
  # Handle tool calls (alternative format)
834
- if hasattr(message, 'tool_calls') and message.tool_calls:
880
+ if hasattr(message, "tool_calls") and message.tool_calls:
835
881
  for tool_call in message.tool_calls:
882
+ tool_name = tool_call.function.name
883
+
884
+ # Handle custom function tools
885
+ if tool_name != "computer":
886
+ from ..responses import make_function_call_item
887
+
888
+ # tool_call.function.arguments is a JSON string, need to parse it
889
+ try:
890
+ args_dict = json.loads(tool_call.function.arguments)
891
+ except json.JSONDecodeError:
892
+ args_dict = {}
893
+ responses_items.append(
894
+ make_function_call_item(
895
+ function_name=tool_name, arguments=args_dict, call_id=tool_call.id
896
+ )
897
+ )
898
+ continue
899
+
900
+ # Handle computer tool
836
901
  if tool_call.function.name == "computer":
837
902
  try:
838
903
  try:
@@ -853,7 +918,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
853
918
  # "id": "call_1",
854
919
  # "type": "function"
855
920
  # }
856
-
921
+
857
922
  # Output:
858
923
  # {
859
924
  # "type": "computer_call",
@@ -862,9 +927,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
862
927
  # "type": "screenshot"
863
928
  # }
864
929
  # }
865
- responses_items.append(make_screenshot_item(
866
- call_id=call_id
867
- ))
930
+ responses_items.append(make_screenshot_item(call_id=call_id))
868
931
  elif action_type in ["click", "left_click"]:
869
932
  # Input:
870
933
  # {
@@ -878,7 +941,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
878
941
  # "id": "call_1",
879
942
  # "type": "function"
880
943
  # }
881
-
944
+
882
945
  # Output:
883
946
  # {
884
947
  # "type": "computer_call",
@@ -890,11 +953,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
890
953
  # }
891
954
  # }
892
955
  coordinate = args.get("coordinate", [0, 0])
893
- responses_items.append(make_click_item(
894
- x=coordinate[0] if len(coordinate) > 0 else 0,
895
- y=coordinate[1] if len(coordinate) > 1 else 0,
896
- call_id=call_id
897
- ))
956
+ responses_items.append(
957
+ make_click_item(
958
+ x=coordinate[0] if len(coordinate) > 0 else 0,
959
+ y=coordinate[1] if len(coordinate) > 1 else 0,
960
+ call_id=call_id,
961
+ )
962
+ )
898
963
  elif action_type in ["type", "type_text"]:
899
964
  # Input:
900
965
  # {
@@ -908,7 +973,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
908
973
  # "id": "call_1",
909
974
  # "type": "function"
910
975
  # }
911
-
976
+
912
977
  # Output:
913
978
  # {
914
979
  # "type": "computer_call",
@@ -918,10 +983,9 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
918
983
  # "text": "Hello World"
919
984
  # }
920
985
  # }
921
- responses_items.append(make_type_item(
922
- text=args.get("text", ""),
923
- call_id=call_id
924
- ))
986
+ responses_items.append(
987
+ make_type_item(text=args.get("text", ""), call_id=call_id)
988
+ )
925
989
  elif action_type in ["key", "keypress", "hotkey"]:
926
990
  # Input:
927
991
  # {
@@ -935,7 +999,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
935
999
  # "id": "call_1",
936
1000
  # "type": "function"
937
1001
  # }
938
-
1002
+
939
1003
  # Output:
940
1004
  # {
941
1005
  # "type": "computer_call",
@@ -945,10 +1009,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
945
1009
  # "keys": ["ctrl", "c"]
946
1010
  # }
947
1011
  # }
948
- responses_items.append(make_keypress_item(
949
- keys=args.get("text", "").replace("+", "-").split("-"),
950
- call_id=call_id
951
- ))
1012
+ responses_items.append(
1013
+ make_keypress_item(
1014
+ keys=args.get("text", "").replace("+", "-").split("-"),
1015
+ call_id=call_id,
1016
+ )
1017
+ )
952
1018
  elif action_type in ["mouse_move", "move_cursor", "move"]:
953
1019
  # Input:
954
1020
  # {
@@ -962,7 +1028,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
962
1028
  # "id": "call_1",
963
1029
  # "type": "function"
964
1030
  # }
965
-
1031
+
966
1032
  # Output:
967
1033
  # {
968
1034
  # "type": "computer_call",
@@ -974,12 +1040,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
974
1040
  # }
975
1041
  # }
976
1042
  coordinate = args.get("coordinate", [0, 0])
977
- responses_items.append(make_move_item(
978
- x=coordinate[0] if len(coordinate) > 0 else 0,
979
- y=coordinate[1] if len(coordinate) > 1 else 0,
980
- call_id=call_id
981
- ))
982
-
1043
+ responses_items.append(
1044
+ make_move_item(
1045
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1046
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1047
+ call_id=call_id,
1048
+ )
1049
+ )
1050
+
983
1051
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
984
1052
  elif action_type == "scroll":
985
1053
  # Input:
@@ -996,7 +1064,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
996
1064
  # "id": "call_1",
997
1065
  # "type": "function"
998
1066
  # }
999
-
1067
+
1000
1068
  # Output:
1001
1069
  # {
1002
1070
  # "type": "computer_call",
@@ -1012,17 +1080,25 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1012
1080
  coordinate = args.get("coordinate", [0, 0])
1013
1081
  direction = args.get("scroll_direction", "down")
1014
1082
  amount = args.get("scroll_amount", 3)
1015
- scroll_x = amount if direction == "left" else \
1016
- -amount if direction == "right" else 0
1017
- scroll_y = amount if direction == "up" else \
1018
- -amount if direction == "down" else 0
1019
- responses_items.append(make_scroll_item(
1020
- x=coordinate[0] if len(coordinate) > 0 else 0,
1021
- y=coordinate[1] if len(coordinate) > 1 else 0,
1022
- scroll_x=scroll_x,
1023
- scroll_y=scroll_y,
1024
- call_id=call_id
1025
- ))
1083
+ scroll_x = (
1084
+ amount
1085
+ if direction == "left"
1086
+ else -amount if direction == "right" else 0
1087
+ )
1088
+ scroll_y = (
1089
+ amount
1090
+ if direction == "up"
1091
+ else -amount if direction == "down" else 0
1092
+ )
1093
+ responses_items.append(
1094
+ make_scroll_item(
1095
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1096
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1097
+ scroll_x=scroll_x,
1098
+ scroll_y=scroll_y,
1099
+ call_id=call_id,
1100
+ )
1101
+ )
1026
1102
  elif action_type in ["left_click_drag", "drag"]:
1027
1103
  # Input:
1028
1104
  # {
@@ -1037,7 +1113,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1037
1113
  # "id": "call_1",
1038
1114
  # "type": "function"
1039
1115
  # }
1040
-
1116
+
1041
1117
  # Output:
1042
1118
  # {
1043
1119
  # "type": "computer_call",
@@ -1052,19 +1128,21 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1052
1128
  # }
1053
1129
  start_coord = args.get("start_coordinate", [0, 0])
1054
1130
  end_coord = args.get("end_coordinate", [0, 0])
1055
- responses_items.append(make_drag_item(
1056
- path=[
1057
- {
1058
- "x": start_coord[0] if len(start_coord) > 0 else 0,
1059
- "y": start_coord[1] if len(start_coord) > 1 else 0
1060
- },
1061
- {
1062
- "x": end_coord[0] if len(end_coord) > 0 else 0,
1063
- "y": end_coord[1] if len(end_coord) > 1 else 0
1064
- }
1065
- ],
1066
- call_id=call_id
1067
- ))
1131
+ responses_items.append(
1132
+ make_drag_item(
1133
+ path=[
1134
+ {
1135
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
1136
+ "y": start_coord[1] if len(start_coord) > 1 else 0,
1137
+ },
1138
+ {
1139
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
1140
+ "y": end_coord[1] if len(end_coord) > 1 else 0,
1141
+ },
1142
+ ],
1143
+ call_id=call_id,
1144
+ )
1145
+ )
1068
1146
  elif action_type == "right_click":
1069
1147
  # Input:
1070
1148
  # {
@@ -1078,7 +1156,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1078
1156
  # "id": "call_1",
1079
1157
  # "type": "function"
1080
1158
  # }
1081
-
1159
+
1082
1160
  # Output:
1083
1161
  # {
1084
1162
  # "type": "computer_call",
@@ -1091,12 +1169,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1091
1169
  # }
1092
1170
  # }
1093
1171
  coordinate = args.get("coordinate", [0, 0])
1094
- responses_items.append(make_click_item(
1095
- x=coordinate[0] if len(coordinate) > 0 else 0,
1096
- y=coordinate[1] if len(coordinate) > 1 else 0,
1097
- button="right",
1098
- call_id=call_id
1099
- ))
1172
+ responses_items.append(
1173
+ make_click_item(
1174
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1175
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1176
+ button="right",
1177
+ call_id=call_id,
1178
+ )
1179
+ )
1100
1180
  elif action_type == "middle_click":
1101
1181
  # Input:
1102
1182
  # {
@@ -1110,7 +1190,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1110
1190
  # "id": "call_1",
1111
1191
  # "type": "function"
1112
1192
  # }
1113
-
1193
+
1114
1194
  # Output:
1115
1195
  # {
1116
1196
  # "type": "computer_call",
@@ -1123,12 +1203,14 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1123
1203
  # }
1124
1204
  # }
1125
1205
  coordinate = args.get("coordinate", [0, 0])
1126
- responses_items.append(make_click_item(
1127
- x=coordinate[0] if len(coordinate) > 0 else 0,
1128
- y=coordinate[1] if len(coordinate) > 1 else 0,
1129
- button="wheel",
1130
- call_id=call_id
1131
- ))
1206
+ responses_items.append(
1207
+ make_click_item(
1208
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1209
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1210
+ button="wheel",
1211
+ call_id=call_id,
1212
+ )
1213
+ )
1132
1214
  elif action_type == "double_click":
1133
1215
  # Input:
1134
1216
  # {
@@ -1142,7 +1224,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1142
1224
  # "id": "call_1",
1143
1225
  # "type": "function"
1144
1226
  # }
1145
-
1227
+
1146
1228
  # Output:
1147
1229
  # {
1148
1230
  # "type": "computer_call",
@@ -1154,11 +1236,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1154
1236
  # }
1155
1237
  # }
1156
1238
  coordinate = args.get("coordinate", [0, 0])
1157
- responses_items.append(make_double_click_item(
1158
- x=coordinate[0] if len(coordinate) > 0 else 0,
1159
- y=coordinate[1] if len(coordinate) > 1 else 0,
1160
- call_id=call_id
1161
- ))
1239
+ responses_items.append(
1240
+ make_double_click_item(
1241
+ x=coordinate[0] if len(coordinate) > 0 else 0,
1242
+ y=coordinate[1] if len(coordinate) > 1 else 0,
1243
+ call_id=call_id,
1244
+ )
1245
+ )
1162
1246
  elif action_type == "triple_click":
1163
1247
  # Input:
1164
1248
  # {
@@ -1172,7 +1256,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1172
1256
  # "id": "call_1",
1173
1257
  # "type": "function"
1174
1258
  # }
1175
-
1259
+
1176
1260
  # Output:
1177
1261
  # {
1178
1262
  # "type": "computer_call",
@@ -1197,7 +1281,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1197
1281
  # "id": "call_1",
1198
1282
  # "type": "function"
1199
1283
  # }
1200
-
1284
+
1201
1285
  # Output:
1202
1286
  # {
1203
1287
  # "type": "computer_call",
@@ -1210,11 +1294,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1210
1294
  # }
1211
1295
  # }
1212
1296
  coordinate = args.get("coordinate", [None, None])
1213
- responses_items.append(make_left_mouse_down_item(
1214
- x=coordinate[0] if len(coordinate) > 0 else None,
1215
- y=coordinate[1] if len(coordinate) > 1 else None,
1216
- call_id=call_id
1217
- ))
1297
+ responses_items.append(
1298
+ make_left_mouse_down_item(
1299
+ x=coordinate[0] if len(coordinate) > 0 else None,
1300
+ y=coordinate[1] if len(coordinate) > 1 else None,
1301
+ call_id=call_id,
1302
+ )
1303
+ )
1218
1304
  elif action_type == "left_mouse_up":
1219
1305
  # Input:
1220
1306
  # {
@@ -1228,7 +1314,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1228
1314
  # "id": "call_1",
1229
1315
  # "type": "function"
1230
1316
  # }
1231
-
1317
+
1232
1318
  # Output:
1233
1319
  # {
1234
1320
  # "type": "computer_call",
@@ -1241,11 +1327,13 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1241
1327
  # }
1242
1328
  # }
1243
1329
  coordinate = args.get("coordinate", [None, None])
1244
- responses_items.append(make_left_mouse_up_item(
1245
- x=coordinate[0] if len(coordinate) > 0 else None,
1246
- y=coordinate[1] if len(coordinate) > 1 else None,
1247
- call_id=call_id
1248
- ))
1330
+ responses_items.append(
1331
+ make_left_mouse_up_item(
1332
+ x=coordinate[0] if len(coordinate) > 0 else None,
1333
+ y=coordinate[1] if len(coordinate) > 1 else None,
1334
+ call_id=call_id,
1335
+ )
1336
+ )
1249
1337
  elif action_type == "hold_key":
1250
1338
  # Input:
1251
1339
  # {
@@ -1259,7 +1347,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1259
1347
  # "id": "call_1",
1260
1348
  # "type": "function"
1261
1349
  # }
1262
-
1350
+
1263
1351
  # Output:
1264
1352
  # {
1265
1353
  # "type": "computer_call",
@@ -1282,7 +1370,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1282
1370
  # "id": "call_1",
1283
1371
  # "type": "function"
1284
1372
  # }
1285
-
1373
+
1286
1374
  # Output:
1287
1375
  # {
1288
1376
  # "type": "computer_call",
@@ -1291,74 +1379,77 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
1291
1379
  # "type": "wait"
1292
1380
  # }
1293
1381
  # }
1294
- responses_items.append(make_wait_item(
1295
- call_id=call_id
1296
- ))
1382
+ responses_items.append(make_wait_item(call_id=call_id))
1297
1383
  except Exception as e:
1298
- responses_items.extend(make_failed_tool_call_items(
1299
- tool_name="computer",
1300
- tool_kwargs=args,
1301
- error_message=repr(e),
1302
- call_id=call_id
1303
- ))
1384
+ responses_items.extend(
1385
+ make_failed_tool_call_items(
1386
+ tool_name="computer",
1387
+ tool_kwargs=args,
1388
+ error_message=repr(e),
1389
+ call_id=call_id,
1390
+ )
1391
+ )
1304
1392
  except json.JSONDecodeError:
1305
1393
  print("Failed to decode tool call arguments")
1306
1394
  # Skip malformed tool calls
1307
1395
  continue
1308
-
1396
+
1309
1397
  return responses_items
1310
1398
 
1399
+
1311
1400
  def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1312
1401
  """Add cache control to completion messages"""
1313
1402
  num_writes = 0
1314
1403
  for message in completion_messages:
1315
- message["cache_control"] = { "type": "ephemeral" }
1404
+ message["cache_control"] = {"type": "ephemeral"}
1316
1405
  num_writes += 1
1317
1406
  # Cache control has a maximum of 4 blocks
1318
1407
  if num_writes >= 4:
1319
1408
  break
1320
-
1409
+
1321
1410
  return completion_messages
1322
1411
 
1412
+
def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold runs of adjacent same-role messages into single messages.

    Each output message is a shallow copy of the first message of its run.
    Its ``content`` is the normalized content of every message in the run,
    concatenated in order, with consecutive text blocks merged afterwards.
    ``tool_calls`` from all messages of the run are concatenated as well.

    Args:
        completion_messages: Chat-completion style message dicts.

    Returns:
        A new list of combined messages, or the input object itself when it
        is empty.
    """
    if not completion_messages:
        return completion_messages

    result = []

    for msg in completion_messages:
        previous = result[-1] if result else None

        if previous is not None and previous["role"] == msg["role"]:
            # Same speaker as the previous entry: append onto it.
            previous["content"].extend(_normalize_content(msg.get("content", "")))
            if "tool_calls" in msg:
                previous.setdefault("tool_calls", []).extend(msg["tool_calls"])
            continue

        # First message, or a role change: start a new entry from a copy.
        fresh = {**msg, "content": _normalize_content(msg.get("content", ""))}
        if "tool_calls" in msg:
            fresh["tool_calls"] = msg["tool_calls"].copy()
        result.append(fresh)

    # Second pass: collapse neighbouring text blocks inside each entry.
    for entry in result:
        entry["content"] = _merge_consecutive_text(entry["content"])

    return result
1361
1451
 
1452
+
1362
1453
  def _normalize_content(content) -> List[Dict[str, Any]]:
1363
1454
  """Normalize content to list format"""
1364
1455
  if isinstance(content, str):
@@ -1371,28 +1462,28 @@ def _normalize_content(content) -> List[Dict[str, Any]]:
1371
1462
  else:
1372
1463
  return []
1373
1464
 
1465
+
1374
1466
  def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1375
1467
  """Merge consecutive text blocks with newlines"""
1376
1468
  if not content_list:
1377
1469
  return content_list
1378
-
1470
+
1379
1471
  merged = []
1380
-
1472
+
1381
1473
  for item in content_list:
1382
- if (item.get("type") == "text" and
1383
- merged and
1384
- merged[-1].get("type") == "text"):
1474
+ if item.get("type") == "text" and merged and merged[-1].get("type") == "text":
1385
1475
  # Merge with previous text block
1386
1476
  merged[-1]["text"] += "\n" + item["text"]
1387
1477
  else:
1388
1478
  merged.append(item.copy())
1389
-
1479
+
1390
1480
  return merged
1391
1481
 
1482
+
1392
1483
  @register_agent(models=r".*claude-.*")
1393
1484
  class AnthropicHostedToolsConfig(AsyncAgentConfig):
1394
1485
  """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
1395
-
1486
+
1396
1487
  async def predict_step(
1397
1488
  self,
1398
1489
  messages: Messages,
@@ -1406,21 +1497,21 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1406
1497
  _on_api_end=None,
1407
1498
  _on_usage=None,
1408
1499
  _on_screenshot=None,
1409
- **kwargs
1500
+ **kwargs,
1410
1501
  ) -> Dict[str, Any]:
1411
1502
  """
1412
1503
  Anthropic hosted tools agent loop using liteLLM acompletion.
1413
-
1504
+
1414
1505
  Supports Anthropic's computer use models with hosted tools.
1415
1506
  """
1416
1507
  tools = tools or []
1417
-
1508
+
1418
1509
  # Get tool configuration for this model
1419
1510
  tool_config = _get_tool_config_for_model(model)
1420
-
1511
+
1421
1512
  # Prepare tools for Anthropic API
1422
1513
  anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
1423
-
1514
+
1424
1515
  # Convert responses_items messages to completion format
1425
1516
  completion_messages = _convert_responses_items_to_completion_messages(messages)
1426
1517
  if use_prompt_caching:
@@ -1428,7 +1519,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1428
1519
  completion_messages = _combine_completion_messages(completion_messages)
1429
1520
  # Then add cache control, anthropic requires explicit "cache_control" dicts
1430
1521
  completion_messages = _add_cache_control(completion_messages)
1431
-
1522
+
1432
1523
  # Prepare API call kwargs
1433
1524
  api_kwargs = {
1434
1525
  "model": model,
@@ -1436,80 +1527,74 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1436
1527
  "tools": anthropic_tools if anthropic_tools else None,
1437
1528
  "stream": stream,
1438
1529
  "num_retries": max_retries,
1439
- **kwargs
1530
+ **kwargs,
1440
1531
  }
1441
-
1532
+
1442
1533
  # Add beta header for computer use
1443
1534
  if anthropic_tools:
1444
- api_kwargs["headers"] = {
1445
- "anthropic-beta": tool_config["beta_flag"]
1446
- }
1447
-
1535
+ api_kwargs["headers"] = {"anthropic-beta": tool_config["beta_flag"]}
1536
+
1448
1537
  # Call API start hook
1449
1538
  if _on_api_start:
1450
1539
  await _on_api_start(api_kwargs)
1451
-
1540
+
1452
1541
  # Use liteLLM acompletion
1453
1542
  response = await litellm.acompletion(**api_kwargs)
1454
-
1543
+
1455
1544
  # Call API end hook
1456
1545
  if _on_api_end:
1457
1546
  await _on_api_end(api_kwargs, response)
1458
-
1547
+
1459
1548
  # Convert response to responses_items format
1460
1549
  responses_items = _convert_completion_to_responses_items(response)
1461
1550
 
1462
1551
  # Extract usage information
1463
- responses_usage = {
1464
- **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
1552
+ responses_usage = {
1553
+ **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
1554
+ response.usage
1555
+ ).model_dump(),
1465
1556
  "response_cost": response._hidden_params.get("response_cost", 0.0),
1466
1557
  }
1467
1558
  if _on_usage:
1468
1559
  await _on_usage(responses_usage)
1469
1560
 
1470
1561
  # Return in AsyncAgentConfig format
1471
- return {
1472
- "output": responses_items,
1473
- "usage": responses_usage
1474
- }
1475
-
1562
+ return {"output": responses_items, "usage": responses_usage}
1563
+
1476
1564
  async def predict_click(
1477
- self,
1478
- model: str,
1479
- image_b64: str,
1480
- instruction: str,
1481
- **kwargs
1565
+ self, model: str, image_b64: str, instruction: str, **kwargs
1482
1566
  ) -> Optional[Tuple[int, int]]:
1483
1567
  """
1484
1568
  Predict click coordinates based on image and instruction.
1485
-
1569
+
1486
1570
  Uses Anthropic's computer use models with a custom prompt that instructs
1487
1571
  the agent to only output clicks.
1488
-
1572
+
1489
1573
  Args:
1490
1574
  model: Model name to use
1491
1575
  image_b64: Base64 encoded image
1492
1576
  instruction: Instruction for where to click
1493
-
1577
+
1494
1578
  Returns:
1495
1579
  Tuple of (x, y) coordinates or None if prediction fails
1496
1580
  """
1497
1581
  # Get image dimensions from base64 data
1498
1582
  try:
1499
1583
  import base64
1500
- from PIL import Image
1501
1584
  from io import BytesIO
1502
-
1585
+
1586
+ from PIL import Image
1587
+
1503
1588
  image_data = base64.b64decode(image_b64)
1504
1589
  image = Image.open(BytesIO(image_data))
1505
1590
  display_width, display_height = image.size
1506
1591
  except Exception:
1507
1592
  # Fallback to default dimensions if image parsing fails
1508
1593
  display_width, display_height = 1024, 768
1509
-
1594
+
1510
1595
  # Get tool configuration for this model
1511
1596
  tool_config = _get_tool_config_for_model(model)
1512
-
1597
+
1513
1598
  # Prepare computer tool for Anthropic format
1514
1599
  computer_tool = {
1515
1600
  "type": tool_config["tool_version"],
@@ -1522,7 +1607,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1522
1607
  },
1523
1608
  },
1524
1609
  }
1525
-
1610
+
1526
1611
  # Construct messages in OpenAI chat completion format for liteLLM
1527
1612
  messages = [
1528
1613
  {
@@ -1541,18 +1626,16 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
1541
1626
  7. Be decisive and action-oriented. Complete the requested task fully.
1542
1627
 
1543
1628
  Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
1544
- Task: Click {instruction}. Output ONLY a click action on the target element."""
1629
+ Task: Click {instruction}. Output ONLY a click action on the target element.""",
1545
1630
  },
1546
1631
  {
1547
1632
  "type": "image_url",
1548
- "image_url": {
1549
- "url": f"data:image/png;base64,{image_b64}"
1550
- }
1551
- }
1552
- ]
1633
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
1634
+ },
1635
+ ],
1553
1636
  }
1554
1637
  ]
1555
-
1638
+
1556
1639
  # Prepare API call kwargs
1557
1640
  api_kwargs = {
1558
1641
  "model": model,
@@ -1560,32 +1643,36 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
1560
1643
  "tools": [computer_tool],
1561
1644
  "stream": False,
1562
1645
  "max_tokens": 100, # Keep response short for click prediction
1563
- "headers": {
1564
- "anthropic-beta": tool_config["beta_flag"]
1565
- }
1646
+ "headers": {"anthropic-beta": tool_config["beta_flag"]},
1566
1647
  }
1567
-
1648
+ # Thread optional API params
1649
+ if "api_key" in kwargs and kwargs.get("api_key") is not None:
1650
+ api_kwargs["api_key"] = kwargs.get("api_key")
1651
+ if "api_base" in kwargs and kwargs.get("api_base") is not None:
1652
+ api_kwargs["api_base"] = kwargs.get("api_base")
1653
+
1568
1654
  # Use liteLLM acompletion
1569
1655
  response = await litellm.acompletion(**api_kwargs)
1570
-
1656
+
1571
1657
  # Convert response to responses_items format to extract click coordinates
1572
1658
  responses_items = _convert_completion_to_responses_items(response)
1573
-
1659
+
1574
1660
  # Look for computer_call with click action
1575
1661
  for item in responses_items:
1576
- if (isinstance(item, dict) and
1577
- item.get("type") == "computer_call" and
1578
- isinstance(item.get("action"), dict)):
1579
-
1662
+ if (
1663
+ isinstance(item, dict)
1664
+ and item.get("type") == "computer_call"
1665
+ and isinstance(item.get("action"), dict)
1666
+ ):
1667
+
1580
1668
  action = item["action"]
1581
- if action.get("type") == "click":
1669
+ if action.get("x") and action.get("y"):
1582
1670
  x = action.get("x")
1583
1671
  y = action.get("y")
1584
- if x is not None and y is not None:
1585
- return (int(x), int(y))
1586
-
1672
+ return (int(x), int(y))
1673
+
1587
1674
  return None
1588
-
1675
+
1589
1676
  def get_capabilities(self) -> List[AgentCapability]:
1590
1677
  """Return the capabilities supported by this agent."""
1591
1678
  return ["click", "step"]