cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +15 -51
- agent/__main__.py +21 -0
- agent/adapters/__init__.py +9 -0
- agent/adapters/huggingfacelocal_adapter.py +216 -0
- agent/agent.py +577 -0
- agent/callbacks/__init__.py +17 -0
- agent/callbacks/base.py +153 -0
- agent/callbacks/budget_manager.py +44 -0
- agent/callbacks/image_retention.py +139 -0
- agent/callbacks/logging.py +247 -0
- agent/callbacks/pii_anonymization.py +259 -0
- agent/callbacks/trajectory_saver.py +305 -0
- agent/cli.py +290 -0
- agent/computer_handler.py +107 -0
- agent/decorators.py +90 -0
- agent/loops/__init__.py +11 -0
- agent/loops/anthropic.py +728 -0
- agent/loops/omniparser.py +339 -0
- agent/loops/openai.py +95 -0
- agent/loops/uitars.py +688 -0
- agent/responses.py +207 -0
- agent/types.py +79 -0
- agent/ui/__init__.py +7 -1
- agent/ui/gradio/__init__.py +6 -19
- agent/ui/gradio/app.py +80 -1299
- agent/ui/gradio/ui_components.py +703 -0
- cua_agent-0.4.0b1.dist-info/METADATA +424 -0
- cua_agent-0.4.0b1.dist-info/RECORD +30 -0
- agent/core/__init__.py +0 -27
- agent/core/agent.py +0 -210
- agent/core/base.py +0 -217
- agent/core/callbacks.py +0 -200
- agent/core/experiment.py +0 -249
- agent/core/factory.py +0 -122
- agent/core/messages.py +0 -332
- agent/core/provider_config.py +0 -21
- agent/core/telemetry.py +0 -142
- agent/core/tools/__init__.py +0 -21
- agent/core/tools/base.py +0 -74
- agent/core/tools/bash.py +0 -52
- agent/core/tools/collection.py +0 -46
- agent/core/tools/computer.py +0 -113
- agent/core/tools/edit.py +0 -67
- agent/core/tools/manager.py +0 -56
- agent/core/tools.py +0 -32
- agent/core/types.py +0 -88
- agent/core/visualization.py +0 -197
- agent/providers/__init__.py +0 -4
- agent/providers/anthropic/__init__.py +0 -6
- agent/providers/anthropic/api/client.py +0 -360
- agent/providers/anthropic/api/logging.py +0 -150
- agent/providers/anthropic/api_handler.py +0 -140
- agent/providers/anthropic/callbacks/__init__.py +0 -5
- agent/providers/anthropic/callbacks/manager.py +0 -65
- agent/providers/anthropic/loop.py +0 -568
- agent/providers/anthropic/prompts.py +0 -23
- agent/providers/anthropic/response_handler.py +0 -226
- agent/providers/anthropic/tools/__init__.py +0 -33
- agent/providers/anthropic/tools/base.py +0 -88
- agent/providers/anthropic/tools/bash.py +0 -66
- agent/providers/anthropic/tools/collection.py +0 -34
- agent/providers/anthropic/tools/computer.py +0 -396
- agent/providers/anthropic/tools/edit.py +0 -326
- agent/providers/anthropic/tools/manager.py +0 -54
- agent/providers/anthropic/tools/run.py +0 -42
- agent/providers/anthropic/types.py +0 -16
- agent/providers/anthropic/utils.py +0 -381
- agent/providers/omni/__init__.py +0 -8
- agent/providers/omni/api_handler.py +0 -42
- agent/providers/omni/clients/anthropic.py +0 -103
- agent/providers/omni/clients/base.py +0 -35
- agent/providers/omni/clients/oaicompat.py +0 -195
- agent/providers/omni/clients/ollama.py +0 -122
- agent/providers/omni/clients/openai.py +0 -155
- agent/providers/omni/clients/utils.py +0 -25
- agent/providers/omni/image_utils.py +0 -34
- agent/providers/omni/loop.py +0 -990
- agent/providers/omni/parser.py +0 -307
- agent/providers/omni/prompts.py +0 -64
- agent/providers/omni/tools/__init__.py +0 -30
- agent/providers/omni/tools/base.py +0 -29
- agent/providers/omni/tools/bash.py +0 -74
- agent/providers/omni/tools/computer.py +0 -179
- agent/providers/omni/tools/manager.py +0 -61
- agent/providers/omni/utils.py +0 -236
- agent/providers/openai/__init__.py +0 -6
- agent/providers/openai/api_handler.py +0 -456
- agent/providers/openai/loop.py +0 -472
- agent/providers/openai/response_handler.py +0 -205
- agent/providers/openai/tools/__init__.py +0 -15
- agent/providers/openai/tools/base.py +0 -79
- agent/providers/openai/tools/computer.py +0 -326
- agent/providers/openai/tools/manager.py +0 -106
- agent/providers/openai/types.py +0 -36
- agent/providers/openai/utils.py +0 -98
- agent/providers/uitars/__init__.py +0 -1
- agent/providers/uitars/clients/base.py +0 -35
- agent/providers/uitars/clients/mlxvlm.py +0 -263
- agent/providers/uitars/clients/oaicompat.py +0 -214
- agent/providers/uitars/loop.py +0 -660
- agent/providers/uitars/prompts.py +0 -63
- agent/providers/uitars/tools/__init__.py +0 -1
- agent/providers/uitars/tools/computer.py +0 -283
- agent/providers/uitars/tools/manager.py +0 -60
- agent/providers/uitars/utils.py +0 -264
- agent/telemetry.py +0 -21
- agent/ui/__main__.py +0 -15
- cua_agent-0.3.2.dist-info/METADATA +0 -295
- cua_agent-0.3.2.dist-info/RECORD +0 -87
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/loops/anthropic.py
ADDED
|
@@ -0,0 +1,728 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Anthropic hosted tools agent loop implementation using liteLLM
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
from typing import Dict, List, Any, AsyncGenerator, Union, Optional
|
|
8
|
+
import litellm
|
|
9
|
+
from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
|
|
10
|
+
|
|
11
|
+
from ..decorators import agent_loop
|
|
12
|
+
from ..types import Messages, AgentResponse, Tools
|
|
13
|
+
from ..responses import (
|
|
14
|
+
make_reasoning_item,
|
|
15
|
+
make_output_text_item,
|
|
16
|
+
make_click_item,
|
|
17
|
+
make_double_click_item,
|
|
18
|
+
make_drag_item,
|
|
19
|
+
make_keypress_item,
|
|
20
|
+
make_move_item,
|
|
21
|
+
make_scroll_item,
|
|
22
|
+
make_type_item,
|
|
23
|
+
make_wait_item,
|
|
24
|
+
make_input_image_item,
|
|
25
|
+
make_screenshot_item
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Model version mapping to tool version and beta flag
|
|
29
|
+
MODEL_TOOL_MAPPING = [
|
|
30
|
+
# Claude 4 models
|
|
31
|
+
{
|
|
32
|
+
"pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
|
|
33
|
+
"tool_version": "computer_20250124",
|
|
34
|
+
"beta_flag": "computer-use-2025-01-24"
|
|
35
|
+
},
|
|
36
|
+
# Claude 3.7 models
|
|
37
|
+
{
|
|
38
|
+
"pattern": r"claude-3\.?7|claude-3-7",
|
|
39
|
+
"tool_version": "computer_20250124",
|
|
40
|
+
"beta_flag": "computer-use-2025-01-24"
|
|
41
|
+
},
|
|
42
|
+
# Claude 3.5 models (fallback)
|
|
43
|
+
{
|
|
44
|
+
"pattern": r"claude-3\.?5|claude-3-5",
|
|
45
|
+
"tool_version": "computer_20241022",
|
|
46
|
+
"beta_flag": "computer-use-2024-10-22"
|
|
47
|
+
}
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
def _get_tool_config_for_model(model: str) -> Dict[str, str]:
|
|
51
|
+
"""Get tool version and beta flag for the given model."""
|
|
52
|
+
import re
|
|
53
|
+
|
|
54
|
+
for mapping in MODEL_TOOL_MAPPING:
|
|
55
|
+
if re.search(mapping["pattern"], model, re.IGNORECASE):
|
|
56
|
+
return {
|
|
57
|
+
"tool_version": mapping["tool_version"],
|
|
58
|
+
"beta_flag": mapping["beta_flag"]
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
# Default to Claude 3.5 configuration
|
|
62
|
+
return {
|
|
63
|
+
"tool_version": "computer_20241022",
|
|
64
|
+
"beta_flag": "computer-use-2024-10-22"
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
|
|
68
|
+
"""Map a computer tool to Anthropic's hosted tool schema."""
|
|
69
|
+
return {
|
|
70
|
+
"type": tool_version,
|
|
71
|
+
"function": {
|
|
72
|
+
"name": "computer",
|
|
73
|
+
"parameters": {
|
|
74
|
+
"display_height_px": getattr(computer_tool, 'display_height', 768),
|
|
75
|
+
"display_width_px": getattr(computer_tool, 'display_width', 1024),
|
|
76
|
+
"display_number": getattr(computer_tool, 'display_number', 1),
|
|
77
|
+
},
|
|
78
|
+
},
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
    """Translate generic tool schemas into Anthropic's tool list format.

    "computer" schemas are mapped to the hosted computer-use tool for the
    version appropriate to ``model``; "function" schemas are passed through in
    OpenAI-style function format. Any other schema type is ignored.
    """
    version = _get_tool_config_for_model(model)["tool_version"]
    prepared: List[Dict[str, Any]] = []

    for schema in tool_schemas:
        schema_type = schema["type"]
        if schema_type == "computer":
            # Hosted computer-use tool for this model's tool version.
            prepared.append(
                _map_computer_tool_to_anthropic(schema["computer"], version)
            )
        elif schema_type == "function":
            # Plain function tool, re-emitted in OpenAI function format.
            fn = schema["function"]
            prepared.append({
                "type": "function",
                "function": {
                    "name": fn["name"],
                    "description": fn.get("description", ""),
                    "parameters": fn.get("parameters", {}),
                },
            })

    return prepared
|
|
106
|
+
|
|
107
|
+
def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
    """Convert responses_items message format to liteLLM completion format.

    Produces a chat-completion message list from responses_items entries:
    - user messages pass through, with "input_image" parts rewritten to
      Anthropic base64 image blocks;
    - assistant messages and "reasoning" items become plain-text assistant
      messages;
    - "computer_call" items become OpenAI-style tool_calls (attached to the
      preceding assistant message when possible);
    - "computer_call_output" items become role="function" result messages.
    """
    completion_messages = []
    
    for message in messages:
        msg_type = message.get("type")
        role = message.get("role")
        
        # Handle user messages (both with and without explicit type)
        if role == "user" or msg_type == "user":
            content = message.get("content", "")
            if isinstance(content, list):
                # Multi-modal content - convert input_image to image format
                converted_content = []
                for item in content:
                    if isinstance(item, dict) and item.get("type") == "input_image":
                        # Convert input_image to Anthropic image format.
                        # Images whose URL is empty or "[omitted]" (e.g. by an
                        # image-retention callback) are dropped entirely.
                        image_url = item.get("image_url", "")
                        if image_url and image_url != "[omitted]":
                            # Extract base64 data from data URL
                            if "," in image_url:
                                base64_data = image_url.split(",")[-1]
                            else:
                                base64_data = image_url
                            
                            # NOTE(review): media_type is hard-coded to PNG --
                            # confirm screenshots are always PNG-encoded.
                            converted_content.append({
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/png",
                                    "data": base64_data
                                }
                            })
                    else:
                        # Keep other content types as-is
                        converted_content.append(item)
                
                completion_messages.append({
                    "role": "user",
                    "content": converted_content if converted_content else content
                })
            else:
                # Text content
                completion_messages.append({
                    "role": "user",
                    "content": content
                })
        
        # Handle assistant messages
        elif role == "assistant":
            content = message.get("content", [])
            if isinstance(content, str):
                content = [{ "type": "output_text", "text": content }]
            
            # Flatten all text parts into a single newline-joined string;
            # non-text parts contribute empty strings.
            content = "\n".join(item.get("text", "") for item in content)
            completion_messages.append({
                "role": "assistant",
                "content": content
            })
        
        elif msg_type == "reasoning":
            # Reasoning becomes part of assistant message
            summary = message.get("summary", [])
            reasoning_text = ""
            
            if isinstance(summary, list) and summary:
                # Extract text from summary items (first summary_text only)
                for item in summary:
                    if isinstance(item, dict) and item.get("type") == "summary_text":
                        reasoning_text = item.get("text", "")
                        break
            else:
                # Fallback to direct reasoning field
                reasoning_text = message.get("reasoning", "")
            
            if reasoning_text:
                completion_messages.append({
                    "role": "assistant",
                    "content": reasoning_text
                })
        
        elif msg_type == "computer_call":
            # Computer call becomes tool use in assistant message.
            # Only click/type/key/wait/screenshot actions are re-encoded here;
            # other action types produce no tool call.
            action = message.get("action", {})
            action_type = action.get("type")
            call_id = message.get("call_id", "call_1")
            
            tool_use_content = []
            
            if action_type == "click":
                tool_use_content.append({
                    "type": "tool_use",
                    "id": call_id,
                    "name": "computer",
                    "input": {
                        "action": "click",
                        "coordinate": [action.get("x", 0), action.get("y", 0)]
                    }
                })
            elif action_type == "type":
                tool_use_content.append({
                    "type": "tool_use",
                    "id": call_id,
                    "name": "computer",
                    "input": {
                        "action": "type",
                        "text": action.get("text", "")
                    }
                })
            elif action_type == "key":
                tool_use_content.append({
                    "type": "tool_use",
                    "id": call_id,
                    "name": "computer",
                    "input": {
                        "action": "key",
                        "key": action.get("key", "")
                    }
                })
            elif action_type == "wait":
                # NOTE(review): a "wait" call is re-encoded with action
                # "screenshot" -- looks like a copy-paste slip or a deliberate
                # fallback for tool versions without "wait"; confirm intent.
                tool_use_content.append({
                    "type": "tool_use",
                    "id": call_id,
                    "name": "computer",
                    "input": {
                        "action": "screenshot"
                    }
                })
            elif action_type == "screenshot":
                tool_use_content.append({
                    "type": "tool_use",
                    "id": call_id,
                    "name": "computer",
                    "input": {
                        "action": "screenshot"
                    }
                })
            
            # Convert tool_use_content to OpenAI tool_calls format
            openai_tool_calls = []
            for tool_use in tool_use_content:
                openai_tool_calls.append({
                    "id": tool_use["id"],
                    "type": "function",
                    "function": {
                        "name": tool_use["name"],
                        "arguments": json.dumps(tool_use["input"])
                    }
                })
            
            # If the last completion message is an assistant message, extend the tool_calls
            if completion_messages and completion_messages[-1].get("role") == "assistant":
                if "tool_calls" not in completion_messages[-1]:
                    completion_messages[-1]["tool_calls"] = []
                completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
            else:
                # Create new assistant message with tool calls
                completion_messages.append({
                    "role": "assistant",
                    "content": None,
                    "tool_calls": openai_tool_calls
                })
        
        elif msg_type == "computer_call_output":
            # Computer call output becomes OpenAI function result
            output = message.get("output", {})
            call_id = message.get("call_id", "call_1")
            
            if output.get("type") == "input_image":
                # Screenshot result - convert to OpenAI format with image_url content
                image_url = output.get("image_url", "")
                completion_messages.append({
                    "role": "function",
                    "name": "computer",
                    "tool_call_id": call_id,
                    "content": [{
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        }
                    }]
                })
            else:
                # Text result - convert to OpenAI format (stringified dict)
                completion_messages.append({
                    "role": "function",
                    "name": "computer",
                    "tool_call_id": call_id,
                    "content": str(output)
                })
    
    return completion_messages
|
|
299
|
+
|
|
300
|
+
def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
    """Convert a liteLLM completion response to responses_items format.

    Handles both shapes liteLLM may return for an Anthropic completion:
    ``message.content`` as a string or a list of content blocks (where
    ``tool_use`` blocks carry computer actions), and ``message.tool_calls``
    in the OpenAI function-calling format.

    Action reference:
    https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions

    Raises:
        NotImplementedError: for actions with no responses_items equivalent
            yet (triple_click, left_mouse_down, left_mouse_up, hold_key).
        ValueError: for an unknown action inside a content-block ``tool_use``.
            NOTE(review): unknown actions in the tool_calls branch are
            silently ignored instead -- this asymmetry looks unintentional.
    """
    responses_items = []

    # Nothing to convert when the response carries no choices.
    if not response or not hasattr(response, 'choices') or not response.choices:
        return responses_items

    choice = response.choices[0]
    message = choice.message

    # --- Content blocks --------------------------------------------------
    if hasattr(message, 'content') and message.content:
        if isinstance(message.content, str):
            responses_items.append(make_output_text_item(message.content))
        elif isinstance(message.content, list):
            for content_item in message.content:
                if isinstance(content_item, dict):
                    if content_item.get("type") == "text":
                        responses_items.append(make_output_text_item(content_item.get("text", "")))
                    elif content_item.get("type") == "tool_use":
                        # Convert an Anthropic tool_use block to a computer call.
                        tool_input = content_item.get("input", {})
                        action_type = tool_input.get("action")
                        call_id = content_item.get("id")

                        # Basic actions (all tool versions)
                        if action_type == "screenshot":
                            responses_items.append(make_screenshot_item(call_id=call_id))
                        elif action_type == "left_click":
                            coordinate = tool_input.get("coordinate", [0, 0])
                            responses_items.append(make_click_item(
                                x=coordinate[0] if len(coordinate) > 0 else 0,
                                y=coordinate[1] if len(coordinate) > 1 else 0,
                                call_id=call_id
                            ))
                        elif action_type == "type":
                            responses_items.append(make_type_item(
                                text=tool_input.get("text", ""),
                                call_id=call_id
                            ))
                        elif action_type == "key":
                            responses_items.append(make_keypress_item(
                                key=tool_input.get("key", ""),
                                call_id=call_id
                            ))
                        elif action_type == "mouse_move":
                            # NOTE(review): built as a raw dict here while the
                            # tool_calls branch uses make_move_item -- confirm
                            # both produce the same item shape.
                            coordinate = tool_input.get("coordinate", [0, 0])
                            responses_items.append({
                                "type": "computer_call",
                                "call_id": call_id,
                                "action": {
                                    "type": "mouse_move",
                                    "x": coordinate[0] if len(coordinate) > 0 else 0,
                                    "y": coordinate[1] if len(coordinate) > 1 else 0
                                }
                            })

                        # Enhanced actions (computer_20250124), available in
                        # Claude 4 and Claude Sonnet 3.7.
                        elif action_type == "scroll":
                            # NOTE(review): this branch passes direction=/amount=
                            # while the tool_calls branch passes scroll_x/scroll_y
                            # -- confirm make_scroll_item supports both.
                            coordinate = tool_input.get("coordinate", [0, 0])
                            responses_items.append(make_scroll_item(
                                x=coordinate[0] if len(coordinate) > 0 else 0,
                                y=coordinate[1] if len(coordinate) > 1 else 0,
                                direction=tool_input.get("scroll_direction", "down"),
                                amount=tool_input.get("scroll_amount", 3),
                                call_id=call_id
                            ))
                        elif action_type == "left_click_drag":
                            start_coord = tool_input.get("start_coordinate", [0, 0])
                            end_coord = tool_input.get("end_coordinate", [0, 0])
                            responses_items.append(make_drag_item(
                                start_x=start_coord[0] if len(start_coord) > 0 else 0,
                                start_y=start_coord[1] if len(start_coord) > 1 else 0,
                                end_x=end_coord[0] if len(end_coord) > 0 else 0,
                                end_y=end_coord[1] if len(end_coord) > 1 else 0,
                                call_id=call_id
                            ))
                        elif action_type == "right_click":
                            coordinate = tool_input.get("coordinate", [0, 0])
                            responses_items.append(make_click_item(
                                x=coordinate[0] if len(coordinate) > 0 else 0,
                                y=coordinate[1] if len(coordinate) > 1 else 0,
                                button="right",
                                call_id=call_id
                            ))
                        elif action_type == "middle_click":
                            coordinate = tool_input.get("coordinate", [0, 0])
                            responses_items.append(make_click_item(
                                x=coordinate[0] if len(coordinate) > 0 else 0,
                                y=coordinate[1] if len(coordinate) > 1 else 0,
                                button="wheel",
                                call_id=call_id
                            ))
                        elif action_type == "double_click":
                            coordinate = tool_input.get("coordinate", [0, 0])
                            responses_items.append(make_double_click_item(
                                x=coordinate[0] if len(coordinate) > 0 else 0,
                                y=coordinate[1] if len(coordinate) > 1 else 0,
                                call_id=call_id
                            ))
                        elif action_type == "triple_click":
                            raise NotImplementedError("triple_click")
                        elif action_type == "left_mouse_down":
                            raise NotImplementedError("left_mouse_down")
                        elif action_type == "left_mouse_up":
                            raise NotImplementedError("left_mouse_up")
                        elif action_type == "hold_key":
                            raise NotImplementedError("hold_key")
                        elif action_type == "wait":
                            responses_items.append(make_wait_item(
                                call_id=call_id
                            ))
                        else:
                            raise ValueError(f"Unknown action type: {action_type}")

    # --- tool_calls (OpenAI function-calling format) ----------------------
    if hasattr(message, 'tool_calls') and message.tool_calls:
        for tool_call in message.tool_calls:
            if tool_call.function.name == "computer":
                try:
                    args = json.loads(tool_call.function.arguments)
                    action_type = args.get("action")
                    call_id = tool_call.id

                    # Basic actions (all tool versions)
                    if action_type == "screenshot":
                        responses_items.append(make_screenshot_item(
                            call_id=call_id
                        ))
                    elif action_type in ["click", "left_click"]:
                        coordinate = args.get("coordinate", [0, 0])
                        responses_items.append(make_click_item(
                            x=coordinate[0] if len(coordinate) > 0 else 0,
                            y=coordinate[1] if len(coordinate) > 1 else 0,
                            call_id=call_id
                        ))
                    elif action_type == "type":
                        responses_items.append(make_type_item(
                            text=args.get("text", ""),
                            call_id=call_id
                        ))
                    elif action_type == "key":
                        responses_items.append(make_keypress_item(
                            key=args.get("key", ""),
                            call_id=call_id
                        ))
                    elif action_type == "mouse_move":
                        coordinate = args.get("coordinate", [0, 0])
                        responses_items.append(make_move_item(
                            x=coordinate[0] if len(coordinate) > 0 else 0,
                            y=coordinate[1] if len(coordinate) > 1 else 0,
                            call_id=call_id
                        ))

                    # Enhanced actions (computer_20250124), available in
                    # Claude 4 and Claude Sonnet 3.7.
                    elif action_type == "scroll":
                        coordinate = args.get("coordinate", [0, 0])
                        direction = args.get("scroll_direction", "down")
                        amount = args.get("scroll_amount", 3)
                        # Map direction/amount onto signed x/y scroll offsets.
                        scroll_x = amount if direction == "left" else \
                                  -amount if direction == "right" else 0
                        scroll_y = amount if direction == "up" else \
                                  -amount if direction == "down" else 0
                        responses_items.append(make_scroll_item(
                            x=coordinate[0] if len(coordinate) > 0 else 0,
                            y=coordinate[1] if len(coordinate) > 1 else 0,
                            scroll_x=scroll_x,
                            scroll_y=scroll_y,
                            call_id=call_id
                        ))
                    elif action_type == "left_click_drag":
                        start_coord = args.get("start_coordinate", [0, 0])
                        end_coord = args.get("end_coordinate", [0, 0])
                        responses_items.append(make_drag_item(
                            start_x=start_coord[0] if len(start_coord) > 0 else 0,
                            start_y=start_coord[1] if len(start_coord) > 1 else 0,
                            end_x=end_coord[0] if len(end_coord) > 0 else 0,
                            end_y=end_coord[1] if len(end_coord) > 1 else 0,
                            call_id=call_id
                        ))
                    elif action_type == "right_click":
                        coordinate = args.get("coordinate", [0, 0])
                        responses_items.append(make_click_item(
                            x=coordinate[0] if len(coordinate) > 0 else 0,
                            y=coordinate[1] if len(coordinate) > 1 else 0,
                            button="right",
                            call_id=call_id
                        ))
                    elif action_type == "middle_click":
                        coordinate = args.get("coordinate", [0, 0])
                        # Fixed: was button="scroll"; the click button enum uses
                        # "wheel" (as in the content-block branch above).
                        responses_items.append(make_click_item(
                            x=coordinate[0] if len(coordinate) > 0 else 0,
                            y=coordinate[1] if len(coordinate) > 1 else 0,
                            button="wheel",
                            call_id=call_id
                        ))
                    elif action_type == "double_click":
                        coordinate = args.get("coordinate", [0, 0])
                        responses_items.append(make_double_click_item(
                            x=coordinate[0] if len(coordinate) > 0 else 0,
                            y=coordinate[1] if len(coordinate) > 1 else 0,
                            call_id=call_id
                        ))
                    elif action_type == "triple_click":
                        raise NotImplementedError("triple_click")
                    elif action_type == "left_mouse_down":
                        raise NotImplementedError("left_mouse_down")
                    elif action_type == "left_mouse_up":
                        raise NotImplementedError("left_mouse_up")
                    elif action_type == "hold_key":
                        raise NotImplementedError("hold_key")
                    elif action_type == "wait":
                        responses_items.append(make_wait_item(
                            call_id=call_id
                        ))
                except json.JSONDecodeError:
                    # Skip malformed tool-call arguments rather than failing
                    # the whole response conversion.
                    continue

    return responses_items
|
|
566
|
+
|
|
567
|
+
def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
568
|
+
"""Add cache control to completion messages"""
|
|
569
|
+
num_writes = 0
|
|
570
|
+
for message in completion_messages:
|
|
571
|
+
message["cache_control"] = { "type": "ephemeral" }
|
|
572
|
+
num_writes += 1
|
|
573
|
+
# Cache control has a maximum of 4 blocks
|
|
574
|
+
if num_writes >= 4:
|
|
575
|
+
break
|
|
576
|
+
|
|
577
|
+
return completion_messages
|
|
578
|
+
|
|
579
|
+
def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Merge adjacent messages that share a role into single messages.

    Content is normalized to the list-of-blocks form, tool_calls lists are
    concatenated, and consecutive text blocks are joined with newlines. The
    input list is not modified.
    """
    if not completion_messages:
        return completion_messages

    combined: List[Dict[str, Any]] = []

    for msg in completion_messages:
        content_blocks = _normalize_content(msg.get("content", ""))

        if combined and combined[-1]["role"] == msg["role"]:
            # Same role as the previous message: fold this one into it.
            previous = combined[-1]
            previous["content"].extend(content_blocks)
            if "tool_calls" in msg:
                previous.setdefault("tool_calls", []).extend(msg["tool_calls"])
        else:
            # Different role (or first message): start a fresh combined entry,
            # preserving any extra keys via a shallow copy.
            merged_msg = msg.copy()
            merged_msg["content"] = content_blocks
            if "tool_calls" in msg:
                merged_msg["tool_calls"] = msg["tool_calls"].copy()
            combined.append(merged_msg)

    # Collapse runs of text blocks inside each combined message.
    for msg in combined:
        msg["content"] = _merge_consecutive_text(msg["content"])

    return combined
|
|
617
|
+
|
|
618
|
+
def _normalize_content(content) -> List[Dict[str, Any]]:
|
|
619
|
+
"""Normalize content to list format"""
|
|
620
|
+
if isinstance(content, str):
|
|
621
|
+
if content.strip(): # Only add non-empty strings
|
|
622
|
+
return [{"type": "text", "text": content}]
|
|
623
|
+
else:
|
|
624
|
+
return []
|
|
625
|
+
elif isinstance(content, list):
|
|
626
|
+
return content.copy()
|
|
627
|
+
else:
|
|
628
|
+
return []
|
|
629
|
+
|
|
630
|
+
def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
631
|
+
"""Merge consecutive text blocks with newlines"""
|
|
632
|
+
if not content_list:
|
|
633
|
+
return content_list
|
|
634
|
+
|
|
635
|
+
merged = []
|
|
636
|
+
|
|
637
|
+
for item in content_list:
|
|
638
|
+
if (item.get("type") == "text" and
|
|
639
|
+
merged and
|
|
640
|
+
merged[-1].get("type") == "text"):
|
|
641
|
+
# Merge with previous text block
|
|
642
|
+
merged[-1]["text"] += "\n" + item["text"]
|
|
643
|
+
else:
|
|
644
|
+
merged.append(item.copy())
|
|
645
|
+
|
|
646
|
+
return merged
|
|
647
|
+
|
|
648
|
+
@agent_loop(models=r".*claude-.*", priority=5)
async def anthropic_hosted_tools_loop(
    messages: Messages,
    model: str,
    tools: Optional[List[Dict[str, Any]]] = None,
    max_retries: Optional[int] = None,
    stream: bool = False,
    computer_handler=None,
    use_prompt_caching: Optional[bool] = False,
    _on_api_start=None,
    _on_api_end=None,
    _on_usage=None,
    _on_screenshot=None,
    **kwargs
) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
    """
    Anthropic hosted tools agent loop using liteLLM acompletion.

    Supports Anthropic's computer use models with hosted tools.

    Args:
        messages: Conversation history in responses_items format.
        model: Model name; selects the computer-use tool version and beta flag.
        tools: Tool schemas ("computer" and "function" entries); may be None.
        max_retries: Forwarded to liteLLM as ``num_retries``.
        stream: Forwarded to liteLLM. NOTE(review): the response below is
            handled as a non-streaming completion, so stream=True likely
            breaks the conversion -- confirm before relying on it.
        computer_handler: Accepted for loop-interface compatibility; unused here.
        use_prompt_caching: When true, merge same-role messages and attach
            Anthropic "cache_control" markers.
        _on_api_start: Optional async hook called with the request kwargs.
        _on_api_end: Optional async hook called with request kwargs and response.
        _on_usage: Optional async hook called with the usage dict.
        _on_screenshot: Accepted for loop-interface compatibility; unused here.

    Returns:
        Dict with "output" (responses_items list) and "usage".
    """
    tools = tools or []
    
    # Get tool configuration for this model (beta flag used below)
    tool_config = _get_tool_config_for_model(model)
    
    # Prepare tools for Anthropic API
    anthropic_tools = _prepare_tools_for_anthropic(tools, model)
    
    # Convert responses_items messages to completion format
    completion_messages = _convert_responses_items_to_completion_messages(messages)
    if use_prompt_caching:
        # First combine messages to reduce number of blocks
        completion_messages = _combine_completion_messages(completion_messages)
        # Then add cache control, anthropic requires explicit "cache_control" dicts
        completion_messages = _add_cache_control(completion_messages)
    
    # Prepare API call kwargs; caller kwargs may override any of these
    api_kwargs = {
        "model": model,
        "messages": completion_messages,
        "tools": anthropic_tools if anthropic_tools else None,
        "stream": stream,
        "num_retries": max_retries,
        **kwargs
    }
    
    # Add beta header for computer use (required by Anthropic's hosted tools)
    if anthropic_tools:
        api_kwargs["headers"] = {
            "anthropic-beta": tool_config["beta_flag"]
        }
    
    # Call API start hook
    if _on_api_start:
        await _on_api_start(api_kwargs)
    
    # Use liteLLM acompletion
    response = await litellm.acompletion(**api_kwargs)
    
    # Call API end hook
    if _on_api_end:
        await _on_api_end(api_kwargs, response)
    
    # Convert response to responses_items format
    responses_items = _convert_completion_to_responses_items(response)
    
    # Extract usage information.
    # NOTE(review): relies on liteLLM internals (private transformation class
    # and response._hidden_params) -- may break across liteLLM versions.
    responses_usage = {
        **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
        "response_cost": response._hidden_params.get("response_cost", 0.0),
    }
    if _on_usage:
        await _on_usage(responses_usage)
    
    # Create agent response
    agent_response = {
        "output": responses_items,
        "usage": responses_usage
    }
    
    return agent_response
|