cua-agent 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (112)
  1. agent/__init__.py +21 -12
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +229 -0
  5. agent/agent.py +594 -0
  6. agent/callbacks/__init__.py +19 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/telemetry.py +210 -0
  13. agent/callbacks/trajectory_saver.py +305 -0
  14. agent/cli.py +297 -0
  15. agent/computer_handler.py +107 -0
  16. agent/decorators.py +90 -0
  17. agent/loops/__init__.py +11 -0
  18. agent/loops/anthropic.py +728 -0
  19. agent/loops/omniparser.py +339 -0
  20. agent/loops/openai.py +95 -0
  21. agent/loops/uitars.py +688 -0
  22. agent/responses.py +207 -0
  23. agent/telemetry.py +135 -14
  24. agent/types.py +79 -0
  25. agent/ui/__init__.py +7 -1
  26. agent/ui/__main__.py +2 -13
  27. agent/ui/gradio/__init__.py +6 -19
  28. agent/ui/gradio/app.py +94 -1313
  29. agent/ui/gradio/ui_components.py +721 -0
  30. cua_agent-0.4.0.dist-info/METADATA +424 -0
  31. cua_agent-0.4.0.dist-info/RECORD +33 -0
  32. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0.dist-info}/WHEEL +1 -1
  33. agent/core/__init__.py +0 -27
  34. agent/core/agent.py +0 -210
  35. agent/core/base.py +0 -217
  36. agent/core/callbacks.py +0 -200
  37. agent/core/experiment.py +0 -249
  38. agent/core/factory.py +0 -122
  39. agent/core/messages.py +0 -332
  40. agent/core/provider_config.py +0 -21
  41. agent/core/telemetry.py +0 -142
  42. agent/core/tools/__init__.py +0 -21
  43. agent/core/tools/base.py +0 -74
  44. agent/core/tools/bash.py +0 -52
  45. agent/core/tools/collection.py +0 -46
  46. agent/core/tools/computer.py +0 -113
  47. agent/core/tools/edit.py +0 -67
  48. agent/core/tools/manager.py +0 -56
  49. agent/core/tools.py +0 -32
  50. agent/core/types.py +0 -88
  51. agent/core/visualization.py +0 -197
  52. agent/providers/__init__.py +0 -4
  53. agent/providers/anthropic/__init__.py +0 -6
  54. agent/providers/anthropic/api/client.py +0 -360
  55. agent/providers/anthropic/api/logging.py +0 -150
  56. agent/providers/anthropic/api_handler.py +0 -140
  57. agent/providers/anthropic/callbacks/__init__.py +0 -5
  58. agent/providers/anthropic/callbacks/manager.py +0 -65
  59. agent/providers/anthropic/loop.py +0 -568
  60. agent/providers/anthropic/prompts.py +0 -23
  61. agent/providers/anthropic/response_handler.py +0 -226
  62. agent/providers/anthropic/tools/__init__.py +0 -33
  63. agent/providers/anthropic/tools/base.py +0 -88
  64. agent/providers/anthropic/tools/bash.py +0 -66
  65. agent/providers/anthropic/tools/collection.py +0 -34
  66. agent/providers/anthropic/tools/computer.py +0 -396
  67. agent/providers/anthropic/tools/edit.py +0 -326
  68. agent/providers/anthropic/tools/manager.py +0 -54
  69. agent/providers/anthropic/tools/run.py +0 -42
  70. agent/providers/anthropic/types.py +0 -16
  71. agent/providers/anthropic/utils.py +0 -367
  72. agent/providers/omni/__init__.py +0 -8
  73. agent/providers/omni/api_handler.py +0 -42
  74. agent/providers/omni/clients/anthropic.py +0 -103
  75. agent/providers/omni/clients/base.py +0 -35
  76. agent/providers/omni/clients/oaicompat.py +0 -195
  77. agent/providers/omni/clients/ollama.py +0 -122
  78. agent/providers/omni/clients/openai.py +0 -155
  79. agent/providers/omni/clients/utils.py +0 -25
  80. agent/providers/omni/image_utils.py +0 -34
  81. agent/providers/omni/loop.py +0 -990
  82. agent/providers/omni/parser.py +0 -307
  83. agent/providers/omni/prompts.py +0 -64
  84. agent/providers/omni/tools/__init__.py +0 -30
  85. agent/providers/omni/tools/base.py +0 -29
  86. agent/providers/omni/tools/bash.py +0 -74
  87. agent/providers/omni/tools/computer.py +0 -179
  88. agent/providers/omni/tools/manager.py +0 -61
  89. agent/providers/omni/utils.py +0 -236
  90. agent/providers/openai/__init__.py +0 -6
  91. agent/providers/openai/api_handler.py +0 -456
  92. agent/providers/openai/loop.py +0 -472
  93. agent/providers/openai/response_handler.py +0 -205
  94. agent/providers/openai/tools/__init__.py +0 -15
  95. agent/providers/openai/tools/base.py +0 -79
  96. agent/providers/openai/tools/computer.py +0 -326
  97. agent/providers/openai/tools/manager.py +0 -106
  98. agent/providers/openai/types.py +0 -36
  99. agent/providers/openai/utils.py +0 -98
  100. agent/providers/uitars/__init__.py +0 -1
  101. agent/providers/uitars/clients/base.py +0 -35
  102. agent/providers/uitars/clients/mlxvlm.py +0 -263
  103. agent/providers/uitars/clients/oaicompat.py +0 -214
  104. agent/providers/uitars/loop.py +0 -660
  105. agent/providers/uitars/prompts.py +0 -63
  106. agent/providers/uitars/tools/__init__.py +0 -1
  107. agent/providers/uitars/tools/computer.py +0 -283
  108. agent/providers/uitars/tools/manager.py +0 -60
  109. agent/providers/uitars/utils.py +0 -264
  110. cua_agent-0.3.1.dist-info/METADATA +0 -295
  111. cua_agent-0.3.1.dist-info/RECORD +0 -87
  112. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0.dist-info}/entry_points.txt +0 -0
agent/loops/anthropic.py
@@ -0,0 +1,728 @@
1
+ """
2
+ Anthropic hosted tools agent loop implementation using liteLLM
3
+ """
4
+
5
+ import asyncio
6
+ import json
7
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional
8
+ import litellm
9
+ from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
10
+
11
+ from ..decorators import agent_loop
12
+ from ..types import Messages, AgentResponse, Tools
13
+ from ..responses import (
14
+ make_reasoning_item,
15
+ make_output_text_item,
16
+ make_click_item,
17
+ make_double_click_item,
18
+ make_drag_item,
19
+ make_keypress_item,
20
+ make_move_item,
21
+ make_scroll_item,
22
+ make_type_item,
23
+ make_wait_item,
24
+ make_input_image_item,
25
+ make_screenshot_item
26
+ )
27
+
28
# Which Anthropic computer-use tool revision (and matching beta header value)
# goes with which model family. Entries are tried top to bottom and the first
# regex that matches the model name wins.
MODEL_TOOL_MAPPING = [
    # Claude 4 family
    {
        "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
        "tool_version": "computer_20250124",
        "beta_flag": "computer-use-2025-01-24",
    },
    # Claude 3.7 family
    {
        "pattern": r"claude-3\.?7|claude-3-7",
        "tool_version": "computer_20250124",
        "beta_flag": "computer-use-2025-01-24",
    },
    # Claude 3.5 family
    {
        "pattern": r"claude-3\.?5|claude-3-5",
        "tool_version": "computer_20241022",
        "beta_flag": "computer-use-2024-10-22",
    },
]


def _get_tool_config_for_model(model: str) -> Dict[str, str]:
    """Return the tool version and beta flag to use for *model*.

    The model name is searched (case-insensitively) against each pattern in
    ``MODEL_TOOL_MAPPING`` in order.

    Args:
        model: Model identifier; it may carry a provider prefix because the
            patterns are matched with ``re.search``.

    Returns:
        A new dict with exactly the keys ``"tool_version"`` and
        ``"beta_flag"``. When no pattern matches, the Claude 3.5 values are
        returned.
    """
    import re

    matched = next(
        (
            entry
            for entry in MODEL_TOOL_MAPPING
            if re.search(entry["pattern"], model, re.IGNORECASE)
        ),
        None,
    )
    if matched is None:
        # Unrecognised model names fall back to the Claude 3.5 configuration.
        return {
            "tool_version": "computer_20241022",
            "beta_flag": "computer-use-2024-10-22",
        }
    return {key: matched[key] for key in ("tool_version", "beta_flag")}
66
+
67
def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
    """Describe *computer_tool* as an Anthropic hosted computer tool entry.

    Args:
        computer_tool: Any object; its ``display_height``, ``display_width``
            and ``display_number`` attributes are read when present.
        tool_version: Value placed in the entry's ``"type"`` field.

    Returns:
        A dict with ``"type"`` and a ``"function"`` named ``"computer"`` whose
        parameters carry the display geometry. Missing attributes default to
        768 (height), 1024 (width) and 1 (display number).
    """
    display_parameters = {
        "display_height_px": getattr(computer_tool, 'display_height', 768),
        "display_width_px": getattr(computer_tool, 'display_width', 1024),
        "display_number": getattr(computer_tool, 'display_number', 1),
    }
    hosted_function = {"name": "computer", "parameters": display_parameters}
    return {"type": tool_version, "function": hosted_function}
80
+
81
def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
    """Translate tool schemas into the tool list sent with the completion call.

    Args:
        tool_schemas: Schemas with a ``"type"`` key. ``"computer"`` schemas
            must carry the tool object under ``"computer"``; ``"function"``
            schemas must carry a dict under ``"function"`` with at least
            ``"name"``.
        model: Model name, used to pick the computer tool version.

    Returns:
        A list with one entry per computer or function schema, in input
        order. Schemas of any other type are skipped.
    """
    tool_version = _get_tool_config_for_model(model)["tool_version"]
    prepared = []

    for tool_schema in tool_schemas:
        kind = tool_schema["type"]

        if kind == "computer":
            # Hosted computer tool: only the display geometry is forwarded.
            prepared.append(
                _map_computer_tool_to_anthropic(tool_schema["computer"], tool_version)
            )
            continue

        if kind != "function":
            continue

        # Plain function tool: name is required, the rest is defaulted.
        spec = tool_schema["function"]
        prepared.append({
            "type": "function",
            "function": {
                "name": spec["name"],
                "description": spec.get("description", ""),
                "parameters": spec.get("parameters", {}),
            },
        })

    return prepared
106
+
107
def _data_url_to_anthropic_image(image_url: str) -> Dict[str, Any]:
    """Build a base64 image content block from a data URL or bare base64 string."""
    media_type = "image/png"
    base64_data = image_url
    if "," in image_url:
        header, base64_data = image_url.rsplit(",", 1)
        # "data:image/jpeg;base64" -> "image/jpeg"; keep the PNG default when
        # the header does not declare a type.
        if header.startswith("data:"):
            declared = header[len("data:"):].split(";", 1)[0]
            if declared:
                media_type = declared
    return {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": base64_data,
        },
    }


def _computer_action_to_anthropic_input(action: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a ``computer_call`` action dict into a computer tool input dict.

    NOTE(review): the action shapes handled beyond click/type/key/wait/
    screenshot assume the OpenAI computer-action layout (``button``,
    ``scroll_x``/``scroll_y``, ``path``, ``keys``) - confirm against the
    ``..responses`` item builders.
    """
    action_type = action.get("type")
    point = [action.get("x", 0), action.get("y", 0)]

    if action_type == "click":
        # Anthropic has one action name per mouse button rather than a
        # generic "click" with a button argument.
        button_actions = {"right": "right_click", "wheel": "middle_click"}
        return {
            "action": button_actions.get(action.get("button"), "left_click"),
            "coordinate": point,
        }
    if action_type == "double_click":
        return {"action": "double_click", "coordinate": point}
    if action_type == "move":
        return {"action": "mouse_move", "coordinate": point}
    if action_type == "type":
        return {"action": "type", "text": action.get("text", "")}
    if action_type == "key":
        # Anthropic's key action carries the combo in "text".
        return {"action": "key", "text": action.get("key", "")}
    if action_type == "keypress":
        keys = action.get("keys") or []
        return {"action": "key", "text": "+".join(str(key) for key in keys)}
    if action_type == "scroll":
        # Inverse of the sign convention used when parsing model output in
        # this module: positive scroll_y is "up", positive scroll_x is "left".
        scroll_x = action.get("scroll_x", 0) or 0
        scroll_y = action.get("scroll_y", 0) or 0
        if scroll_y:
            direction, amount = ("up" if scroll_y > 0 else "down"), abs(scroll_y)
        elif scroll_x:
            direction, amount = ("left" if scroll_x > 0 else "right"), abs(scroll_x)
        else:
            direction, amount = "down", 0
        return {
            "action": "scroll",
            "coordinate": point,
            "scroll_direction": direction,
            "scroll_amount": amount,
        }
    if action_type == "drag":
        path = action.get("path") or []
        start = path[0] if path else {}
        end = path[-1] if path else {}
        if isinstance(start, dict) and isinstance(end, dict):
            return {
                "action": "left_click_drag",
                "start_coordinate": [start.get("x", 0), start.get("y", 0)],
                "end_coordinate": [end.get("x", 0), end.get("y", 0)],
            }
    if action_type in ("wait", "screenshot"):
        # "wait" has always been replayed as a screenshot request here.
        # NOTE(review): presumably deliberate - confirm.
        return {"action": "screenshot"}

    # Unknown action: forward it unchanged so that the tool call still exists
    # and the tool result that follows it stays paired with a call id.
    passthrough = {key: value for key, value in action.items() if key != "type"}
    return {"action": action_type, **passthrough}


def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
    """Convert responses-style items into liteLLM completion messages.

    Handled items:

    * user messages (``role == "user"`` or ``type == "user"``): text is
      passed through; in list content, ``input_image`` parts become base64
      image blocks (parts with an empty or ``"[omitted]"`` URL are dropped)
      and every other part is kept as-is;
    * assistant messages: the ``text`` of each content part joined by
      newlines;
    * ``reasoning`` items: the first ``summary_text`` entry, else the
      ``reasoning`` field, emitted as an assistant message when non-empty;
    * ``computer_call`` items: an OpenAI-style ``tool_calls`` entry named
      ``"computer"``, attached to the preceding assistant message if there is
      one, otherwise to a new assistant message with ``content`` ``None``;
    * ``computer_call_output`` items: a ``"function"`` role message carrying
      either an ``image_url`` part or ``str(output)``.

    Items of any other kind are ignored.

    Args:
        messages: Iterable of dict-like items.

    Returns:
        The list of completion messages, in input order.
    """
    completion_messages: List[Dict[str, Any]] = []

    for message in messages:
        msg_type = message.get("type")
        role = message.get("role")

        if role == "user" or msg_type == "user":
            content = message.get("content", "")
            if isinstance(content, list):
                converted_content = []
                for item in content:
                    if isinstance(item, dict) and item.get("type") == "input_image":
                        image_url = item.get("image_url", "")
                        if image_url and image_url != "[omitted]":
                            converted_content.append(_data_url_to_anthropic_image(image_url))
                    else:
                        converted_content.append(item)
                # When every part was dropped, fall back to the raw content.
                content = converted_content if converted_content else content
            completion_messages.append({"role": "user", "content": content})

        elif role == "assistant":
            content = message.get("content", [])
            if isinstance(content, str):
                text = content
            else:
                text = "\n".join(
                    item.get("text", "")
                    for item in (content or [])
                    if isinstance(item, dict)
                )
            completion_messages.append({"role": "assistant", "content": text})

        elif msg_type == "reasoning":
            summary = message.get("summary", [])
            reasoning_text = ""
            if isinstance(summary, list):
                for item in summary:
                    if isinstance(item, dict) and item.get("type") == "summary_text":
                        reasoning_text = item.get("text", "")
                        break
            if not reasoning_text:
                # No usable summary entry: use the direct reasoning field.
                reasoning_text = message.get("reasoning", "")
            if reasoning_text:
                completion_messages.append({"role": "assistant", "content": reasoning_text})

        elif msg_type == "computer_call":
            tool_input = _computer_action_to_anthropic_input(message.get("action") or {})
            tool_call = {
                "id": message.get("call_id", "call_1"),
                "type": "function",
                "function": {
                    "name": "computer",
                    "arguments": json.dumps(tool_input),
                },
            }
            previous = completion_messages[-1] if completion_messages else None
            if previous is not None and previous.get("role") == "assistant":
                previous.setdefault("tool_calls", []).append(tool_call)
            else:
                completion_messages.append({
                    "role": "assistant",
                    "content": None,
                    "tool_calls": [tool_call],
                })

        elif msg_type == "computer_call_output":
            output = message.get("output", {})
            if isinstance(output, dict) and output.get("type") == "input_image":
                # Screenshot result.
                result_content: Any = [{
                    "type": "image_url",
                    "image_url": {"url": output.get("image_url", "")},
                }]
            else:
                result_content = str(output)
            completion_messages.append({
                "role": "function",
                "name": "computer",
                "tool_call_id": message.get("call_id", "call_1"),
                "content": result_content,
            })

    return completion_messages
299
+
300
def _coordinate_xy(coordinate: Any) -> tuple:
    """Return ``(x, y)`` from a coordinate sequence, using 0 for missing entries."""
    values = list(coordinate or [])
    x = values[0] if len(values) > 0 else 0
    y = values[1] if len(values) > 1 else 0
    return x, y


def _anthropic_action_to_responses_item(action_type: Any, args: Dict[str, Any], call_id: Any) -> Optional[Any]:
    """Build the responses item for one Anthropic computer-tool action.

    Shared by both response layouts (content blocks and ``tool_calls``) so
    they cannot drift apart. Action reference:
    https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions

    Returns:
        The item produced by the matching ``make_*_item`` helper, or ``None``
        when *action_type* is not recognised.

    Raises:
        NotImplementedError: for ``triple_click``, ``left_mouse_down``,
            ``left_mouse_up`` and ``hold_key``, which have no mapping yet.
    """
    # Basic actions (all tool versions)
    if action_type == "screenshot":
        return make_screenshot_item(call_id=call_id)
    if action_type in ("click", "left_click"):
        x, y = _coordinate_xy(args.get("coordinate"))
        return make_click_item(x=x, y=y, call_id=call_id)
    if action_type == "type":
        return make_type_item(text=args.get("text", ""), call_id=call_id)
    if action_type == "key":
        # Anthropic sends the key combo in "text"; "key" is still honoured
        # first for payloads that use it.
        return make_keypress_item(
            key=args.get("key") or args.get("text", ""),
            call_id=call_id,
        )
    if action_type == "mouse_move":
        x, y = _coordinate_xy(args.get("coordinate"))
        return make_move_item(x=x, y=y, call_id=call_id)

    # Enhanced actions (computer_20250124)
    if action_type == "scroll":
        x, y = _coordinate_xy(args.get("coordinate"))
        direction = args.get("scroll_direction", "down")
        amount = args.get("scroll_amount", 3)
        # Sign convention: "left"/"up" are positive, "right"/"down" negative.
        scroll_x = amount if direction == "left" else -amount if direction == "right" else 0
        scroll_y = amount if direction == "up" else -amount if direction == "down" else 0
        return make_scroll_item(x=x, y=y, scroll_x=scroll_x, scroll_y=scroll_y, call_id=call_id)
    if action_type == "left_click_drag":
        start_x, start_y = _coordinate_xy(args.get("start_coordinate"))
        end_x, end_y = _coordinate_xy(args.get("end_coordinate"))
        return make_drag_item(
            start_x=start_x,
            start_y=start_y,
            end_x=end_x,
            end_y=end_y,
            call_id=call_id,
        )
    if action_type == "right_click":
        x, y = _coordinate_xy(args.get("coordinate"))
        return make_click_item(x=x, y=y, button="right", call_id=call_id)
    if action_type == "middle_click":
        x, y = _coordinate_xy(args.get("coordinate"))
        # "wheel" is the middle-button name in the responses click action.
        return make_click_item(x=x, y=y, button="wheel", call_id=call_id)
    if action_type == "double_click":
        x, y = _coordinate_xy(args.get("coordinate"))
        return make_double_click_item(x=x, y=y, call_id=call_id)
    if action_type in ("triple_click", "left_mouse_down", "left_mouse_up", "hold_key"):
        raise NotImplementedError(action_type)
    if action_type == "wait":
        return make_wait_item(call_id=call_id)
    return None


def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
    """Convert a liteLLM completion response into responses-style items.

    Only the first choice is read. Its message may carry the model output in
    two layouts, both of which are handled:

    * ``message.content`` - a string (one output-text item) or a list of
      dict blocks, where ``text`` blocks become output-text items and
      ``tool_use`` blocks become computer-call items;
    * ``message.tool_calls`` - OpenAI-style calls; only calls to the
      ``"computer"`` function are converted. Calls whose arguments are not a
      JSON object, or whose action is not recognised, are skipped.
      NOTE(review): calls to other function tools are dropped here - confirm
      that is intended.

    Args:
        response: Object with a ``choices`` attribute; falsy values and
            responses without choices yield an empty list.

    Returns:
        The list of responses items, content first, then tool calls.

    Raises:
        ValueError: when a ``tool_use`` content block names an unknown action.
        NotImplementedError: for the actions that have no mapping yet
            (``triple_click``, ``left_mouse_down``, ``left_mouse_up``,
            ``hold_key``).
    """
    import logging

    responses_items: List[Dict[str, Any]] = []

    if not response or not hasattr(response, 'choices') or not response.choices:
        return responses_items

    message = response.choices[0].message

    # Layout 1: content as a string or as a list of blocks.
    content = getattr(message, 'content', None)
    if isinstance(content, str):
        if content:
            responses_items.append(make_output_text_item(content))
    elif isinstance(content, list):
        for content_item in content:
            if not isinstance(content_item, dict):
                continue
            item_type = content_item.get("type")
            if item_type == "text":
                responses_items.append(make_output_text_item(content_item.get("text", "")))
            elif item_type == "tool_use":
                tool_input = content_item.get("input") or {}
                action_type = tool_input.get("action")
                item = _anthropic_action_to_responses_item(
                    action_type, tool_input, content_item.get("id")
                )
                if item is None:
                    raise ValueError(f"Unknown action type: {action_type}")
                responses_items.append(item)

    # Layout 2: OpenAI-style tool calls.
    for tool_call in getattr(message, 'tool_calls', None) or []:
        if tool_call.function.name != "computer":
            continue
        try:
            args = json.loads(tool_call.function.arguments)
        except (json.JSONDecodeError, TypeError):
            # Skip malformed tool calls instead of failing the whole turn.
            logging.getLogger(__name__).warning("Failed to decode tool call arguments")
            continue
        if not isinstance(args, dict):
            logging.getLogger(__name__).warning("Failed to decode tool call arguments")
            continue
        item = _anthropic_action_to_responses_item(args.get("action"), args, tool_call.id)
        if item is not None:
            responses_items.append(item)

    return responses_items
566
+
567
def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Mark the leading messages as ephemeral cache entries.

    The first four messages (fewer if the list is shorter) get a
    ``"cache_control"`` key set to ``{"type": "ephemeral"}``. The messages are
    modified in place.

    Args:
        completion_messages: Completion messages to annotate.

    Returns:
        The same list object that was passed in.
    """
    # Cache control has a maximum of 4 blocks, so stop after the fourth.
    for entry in completion_messages[:4]:
        entry["cache_control"] = {"type": "ephemeral"}

    return completion_messages
578
+
579
def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Merge runs of consecutive same-role messages into single messages.

    Every output message has its content normalised to a list of blocks;
    merged messages concatenate their blocks and their ``tool_calls``, and
    adjacent text blocks are then joined with newlines.

    Messages carrying a ``"tool_call_id"`` (tool results) are never merged:
    each one answers a specific call, and folding two of them together would
    keep only the first id and leave the second call without a result.

    Args:
        completion_messages: Messages to combine; each must have a ``"role"``.
            The input dicts are not modified.

    Returns:
        A new list of combined messages, or the input itself when it is empty.
    """
    if not completion_messages:
        return completion_messages

    combined_messages: List[Dict[str, Any]] = []

    for message in completion_messages:
        previous = combined_messages[-1] if combined_messages else None
        mergeable = (
            previous is not None
            and previous["role"] == message["role"]
            and "tool_call_id" not in previous
            and "tool_call_id" not in message
        )

        if not mergeable:
            # Start a new output message; copy the containers so the merge
            # steps below never touch the caller's data.
            new_message = message.copy()
            new_message["content"] = _normalize_content(message.get("content", ""))
            if "tool_calls" in message:
                new_message["tool_calls"] = message["tool_calls"].copy()
            combined_messages.append(new_message)
            continue

        # Same role as the previous output message: fold this one into it.
        previous["content"].extend(_normalize_content(message.get("content", "")))
        if "tool_calls" in message:
            previous.setdefault("tool_calls", []).extend(message["tool_calls"])

    # Post-process to merge consecutive text blocks.
    for message in combined_messages:
        message["content"] = _merge_consecutive_text(message["content"])

    return combined_messages
617
+
618
def _normalize_content(content) -> List[Dict[str, Any]]:
    """Return *content* as a list of content blocks.

    * a list is returned as a shallow copy;
    * a string containing non-whitespace characters becomes a single
      ``{"type": "text", "text": ...}`` block holding the original string;
    * anything else (blank strings, ``None``, other types) becomes ``[]``.
    """
    if isinstance(content, list):
        return content.copy()
    if isinstance(content, str) and content.strip():
        # Whitespace-only strings are dropped rather than wrapped.
        return [{"type": "text", "text": content}]
    return []
629
+
630
def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Join adjacent ``"text"`` blocks into one block, separated by newlines.

    Blocks of other types are kept and break the run. Every output block is a
    shallow copy, so the input blocks are left untouched.

    Args:
        content_list: Content blocks (dicts with a ``"type"`` key).

    Returns:
        A new list of blocks, or the input itself when it is empty or falsy.
    """
    if not content_list:
        return content_list

    result = []

    for block in content_list:
        is_text = block.get("type") == "text"
        tail = result[-1] if result else None
        if is_text and tail is not None and tail.get("type") == "text":
            # Extend the previous text block instead of adding a new one.
            tail["text"] = tail["text"] + "\n" + block["text"]
        else:
            result.append(block.copy())

    return result
647
+
648
@agent_loop(models=r".*claude-.*", priority=5)
async def anthropic_hosted_tools_loop(
    messages: Messages,
    model: str,
    tools: Optional[List[Dict[str, Any]]] = None,
    max_retries: Optional[int] = None,
    stream: bool = False,
    computer_handler=None,
    use_prompt_caching: Optional[bool] = False,
    _on_api_start=None,
    _on_api_end=None,
    _on_usage=None,
    _on_screenshot=None,
    **kwargs
) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
    """
    Anthropic hosted tools agent loop using liteLLM acompletion.

    Runs one model turn: converts the responses-style history to completion
    messages, calls ``litellm.acompletion`` and converts the reply back to
    responses items.

    Args:
        messages: Conversation history as responses-style items.
        model: Model name; also selects the computer tool version and the
            ``anthropic-beta`` header value.
        tools: Tool schemas (``"computer"`` / ``"function"``); ``None`` means
            no tools.
        max_retries: Passed to liteLLM as ``num_retries``.
        stream: Passed to liteLLM as ``stream``.
            NOTE(review): the usage extraction below reads ``response.usage``
            and ``response._hidden_params``, which presumably are not
            available on a streamed response - confirm before enabling.
        computer_handler: Accepted but not referenced by this function.
        use_prompt_caching: When truthy, same-role messages are combined and
            cache-control markers are added before the call.
        _on_api_start: Optional async hook awaited with the call kwargs.
        _on_api_end: Optional async hook awaited with the call kwargs and the
            response.
        _on_usage: Optional async hook awaited with the usage dict.
        _on_screenshot: Accepted but not referenced by this function.
        **kwargs: Extra keyword arguments forwarded to ``litellm.acompletion``.

    Returns:
        A dict with ``"output"`` (the responses items) and ``"usage"`` (the
        transformed usage plus ``"response_cost"``).
    """
    tools = tools or []

    # Tool version and beta flag depend on the model family.
    tool_config = _get_tool_config_for_model(model)
    anthropic_tools = _prepare_tools_for_anthropic(tools, model)

    completion_messages = _convert_responses_items_to_completion_messages(messages)
    if use_prompt_caching:
        # Combine first so that fewer blocks compete for the limited number
        # of cache-control markers, then add the explicit markers.
        completion_messages = _combine_completion_messages(completion_messages)
        completion_messages = _add_cache_control(completion_messages)

    api_kwargs = {
        "model": model,
        "messages": completion_messages,
        "tools": anthropic_tools if anthropic_tools else None,
        "stream": stream,
        "num_retries": max_retries,
        **kwargs
    }

    # Add the beta header for computer use, keeping any headers the caller
    # supplied through **kwargs instead of replacing them wholesale.
    if anthropic_tools:
        caller_headers = api_kwargs.get("headers") or {}
        api_kwargs["headers"] = {
            **caller_headers,
            "anthropic-beta": tool_config["beta_flag"]
        }

    if _on_api_start:
        await _on_api_start(api_kwargs)

    response = await litellm.acompletion(**api_kwargs)

    if _on_api_end:
        await _on_api_end(api_kwargs, response)

    responses_items = _convert_completion_to_responses_items(response)

    # Usage in responses format, plus the cost liteLLM attached to the response.
    responses_usage = {
        **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
        "response_cost": response._hidden_params.get("response_cost", 0.0),
    }
    if _on_usage:
        await _on_usage(responses_usage)

    agent_response = {
        "output": responses_items,
        "usage": responses_usage
    }

    return agent_response