cua-agent: cua_agent-0.4.6-py3-none-any.whl → cua_agent-0.4.8-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +2 -2
- agent/adapters/huggingfacelocal_adapter.py +8 -5
- agent/agent.py +85 -15
- agent/cli.py +9 -3
- agent/computer_handler.py +3 -1
- agent/decorators.py +28 -66
- agent/loops/__init__.py +3 -1
- agent/loops/anthropic.py +200 -84
- agent/loops/base.py +76 -0
- agent/loops/composed_grounded.py +318 -0
- agent/loops/gta1.py +178 -0
- agent/loops/model_types.csv +6 -0
- agent/loops/omniparser.py +178 -84
- agent/loops/openai.py +198 -58
- agent/loops/uitars.py +305 -178
- agent/responses.py +477 -1
- agent/types.py +7 -5
- agent/ui/gradio/app.py +14 -7
- agent/ui/gradio/ui_components.py +18 -1
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/METADATA +3 -3
- cua_agent-0.4.8.dist-info/RECORD +37 -0
- cua_agent-0.4.6.dist-info/RECORD +0 -33
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/entry_points.txt +0 -0
agent/loops/anthropic.py
CHANGED
|
@@ -4,12 +4,13 @@ Anthropic hosted tools agent loop implementation using liteLLM
|
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import json
|
|
7
|
-
from typing import Dict, List, Any, AsyncGenerator, Union, Optional
|
|
7
|
+
from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
|
|
8
8
|
import litellm
|
|
9
9
|
from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
|
|
10
10
|
|
|
11
|
-
from ..decorators import
|
|
12
|
-
from ..types import Messages, AgentResponse, Tools
|
|
11
|
+
from ..decorators import register_agent
|
|
12
|
+
from ..types import Messages, AgentResponse, Tools, AgentCapability
|
|
13
|
+
from ..loops.base import AsyncAgentConfig
|
|
13
14
|
from ..responses import (
|
|
14
15
|
make_reasoning_item,
|
|
15
16
|
make_output_text_item,
|
|
@@ -64,21 +65,28 @@ def _get_tool_config_for_model(model: str) -> Dict[str, str]:
|
|
|
64
65
|
"beta_flag": "computer-use-2024-10-22"
|
|
65
66
|
}
|
|
66
67
|
|
|
67
|
-
def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
|
|
68
|
+
async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
|
|
68
69
|
"""Map a computer tool to Anthropic's hosted tool schema."""
|
|
70
|
+
# Get dimensions from the computer handler
|
|
71
|
+
try:
|
|
72
|
+
width, height = await computer_tool.get_dimensions()
|
|
73
|
+
except Exception:
|
|
74
|
+
# Fallback to default dimensions if method fails
|
|
75
|
+
width, height = 1024, 768
|
|
76
|
+
|
|
69
77
|
return {
|
|
70
78
|
"type": tool_version,
|
|
71
79
|
"function": {
|
|
72
80
|
"name": "computer",
|
|
73
81
|
"parameters": {
|
|
74
|
-
"display_height_px":
|
|
75
|
-
"display_width_px":
|
|
76
|
-
"display_number":
|
|
82
|
+
"display_height_px": height,
|
|
83
|
+
"display_width_px": width,
|
|
84
|
+
"display_number": 1,
|
|
77
85
|
},
|
|
78
86
|
},
|
|
79
87
|
}
|
|
80
88
|
|
|
81
|
-
def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
|
|
89
|
+
async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
|
|
82
90
|
"""Prepare tools for Anthropic API format."""
|
|
83
91
|
tool_config = _get_tool_config_for_model(model)
|
|
84
92
|
anthropic_tools = []
|
|
@@ -86,7 +94,7 @@ def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str)
|
|
|
86
94
|
for schema in tool_schemas:
|
|
87
95
|
if schema["type"] == "computer":
|
|
88
96
|
# Map computer tool to Anthropic format
|
|
89
|
-
anthropic_tools.append(_map_computer_tool_to_anthropic(
|
|
97
|
+
anthropic_tools.append(await _map_computer_tool_to_anthropic(
|
|
90
98
|
schema["computer"],
|
|
91
99
|
tool_config["tool_version"]
|
|
92
100
|
))
|
|
@@ -1284,84 +1292,192 @@ def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str
|
|
|
1284
1292
|
|
|
1285
1293
|
return merged
|
|
1286
1294
|
|
|
1287
|
-
@
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
model: str,
|
|
1291
|
-
tools: Optional[List[Dict[str, Any]]] = None,
|
|
1292
|
-
max_retries: Optional[int] = None,
|
|
1293
|
-
stream: bool = False,
|
|
1294
|
-
computer_handler=None,
|
|
1295
|
-
use_prompt_caching: Optional[bool] = False,
|
|
1296
|
-
_on_api_start=None,
|
|
1297
|
-
_on_api_end=None,
|
|
1298
|
-
_on_usage=None,
|
|
1299
|
-
_on_screenshot=None,
|
|
1300
|
-
**kwargs
|
|
1301
|
-
) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
|
|
1302
|
-
"""
|
|
1303
|
-
Anthropic hosted tools agent loop using liteLLM acompletion.
|
|
1304
|
-
|
|
1305
|
-
Supports Anthropic's computer use models with hosted tools.
|
|
1306
|
-
"""
|
|
1307
|
-
tools = tools or []
|
|
1308
|
-
|
|
1309
|
-
# Get tool configuration for this model
|
|
1310
|
-
tool_config = _get_tool_config_for_model(model)
|
|
1295
|
+
@register_agent(models=r".*claude-.*")
|
|
1296
|
+
class AnthropicHostedToolsConfig(AsyncAgentConfig):
|
|
1297
|
+
"""Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
|
|
1311
1298
|
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
"model": model,
|
|
1326
|
-
"messages": completion_messages,
|
|
1327
|
-
"tools": anthropic_tools if anthropic_tools else None,
|
|
1328
|
-
"stream": stream,
|
|
1329
|
-
"num_retries": max_retries,
|
|
1299
|
+
async def predict_step(
|
|
1300
|
+
self,
|
|
1301
|
+
messages: Messages,
|
|
1302
|
+
model: str,
|
|
1303
|
+
tools: Optional[List[Dict[str, Any]]] = None,
|
|
1304
|
+
max_retries: Optional[int] = None,
|
|
1305
|
+
stream: bool = False,
|
|
1306
|
+
computer_handler=None,
|
|
1307
|
+
use_prompt_caching: Optional[bool] = False,
|
|
1308
|
+
_on_api_start=None,
|
|
1309
|
+
_on_api_end=None,
|
|
1310
|
+
_on_usage=None,
|
|
1311
|
+
_on_screenshot=None,
|
|
1330
1312
|
**kwargs
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1313
|
+
) -> Dict[str, Any]:
|
|
1314
|
+
"""
|
|
1315
|
+
Anthropic hosted tools agent loop using liteLLM acompletion.
|
|
1316
|
+
|
|
1317
|
+
Supports Anthropic's computer use models with hosted tools.
|
|
1318
|
+
"""
|
|
1319
|
+
tools = tools or []
|
|
1320
|
+
|
|
1321
|
+
# Get tool configuration for this model
|
|
1322
|
+
tool_config = _get_tool_config_for_model(model)
|
|
1323
|
+
|
|
1324
|
+
# Prepare tools for Anthropic API
|
|
1325
|
+
anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
|
|
1326
|
+
|
|
1327
|
+
# Convert responses_items messages to completion format
|
|
1328
|
+
completion_messages = _convert_responses_items_to_completion_messages(messages)
|
|
1329
|
+
if use_prompt_caching:
|
|
1330
|
+
# First combine messages to reduce number of blocks
|
|
1331
|
+
completion_messages = _combine_completion_messages(completion_messages)
|
|
1332
|
+
# Then add cache control, anthropic requires explicit "cache_control" dicts
|
|
1333
|
+
completion_messages = _add_cache_control(completion_messages)
|
|
1334
|
+
|
|
1335
|
+
# Prepare API call kwargs
|
|
1336
|
+
api_kwargs = {
|
|
1337
|
+
"model": model,
|
|
1338
|
+
"messages": completion_messages,
|
|
1339
|
+
"tools": anthropic_tools if anthropic_tools else None,
|
|
1340
|
+
"stream": stream,
|
|
1341
|
+
"num_retries": max_retries,
|
|
1342
|
+
**kwargs
|
|
1337
1343
|
}
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1344
|
+
|
|
1345
|
+
# Add beta header for computer use
|
|
1346
|
+
if anthropic_tools:
|
|
1347
|
+
api_kwargs["headers"] = {
|
|
1348
|
+
"anthropic-beta": tool_config["beta_flag"]
|
|
1349
|
+
}
|
|
1350
|
+
|
|
1351
|
+
# Call API start hook
|
|
1352
|
+
if _on_api_start:
|
|
1353
|
+
await _on_api_start(api_kwargs)
|
|
1354
|
+
|
|
1355
|
+
# Use liteLLM acompletion
|
|
1356
|
+
response = await litellm.acompletion(**api_kwargs)
|
|
1357
|
+
|
|
1358
|
+
# Call API end hook
|
|
1359
|
+
if _on_api_end:
|
|
1360
|
+
await _on_api_end(api_kwargs, response)
|
|
1361
|
+
|
|
1362
|
+
# Convert response to responses_items format
|
|
1363
|
+
responses_items = _convert_completion_to_responses_items(response)
|
|
1352
1364
|
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1365
|
+
# Extract usage information
|
|
1366
|
+
responses_usage = {
|
|
1367
|
+
**LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
|
|
1368
|
+
"response_cost": response._hidden_params.get("response_cost", 0.0),
|
|
1369
|
+
}
|
|
1370
|
+
if _on_usage:
|
|
1371
|
+
await _on_usage(responses_usage)
|
|
1360
1372
|
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1373
|
+
# Return in AsyncAgentConfig format
|
|
1374
|
+
return {
|
|
1375
|
+
"output": responses_items,
|
|
1376
|
+
"usage": responses_usage
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
async def predict_click(
|
|
1380
|
+
self,
|
|
1381
|
+
model: str,
|
|
1382
|
+
image_b64: str,
|
|
1383
|
+
instruction: str,
|
|
1384
|
+
**kwargs
|
|
1385
|
+
) -> Optional[Tuple[int, int]]:
|
|
1386
|
+
"""
|
|
1387
|
+
Predict click coordinates based on image and instruction.
|
|
1388
|
+
|
|
1389
|
+
Uses Anthropic's computer use models with a custom prompt that instructs
|
|
1390
|
+
the agent to only output clicks.
|
|
1391
|
+
|
|
1392
|
+
Args:
|
|
1393
|
+
model: Model name to use
|
|
1394
|
+
image_b64: Base64 encoded image
|
|
1395
|
+
instruction: Instruction for where to click
|
|
1396
|
+
|
|
1397
|
+
Returns:
|
|
1398
|
+
Tuple of (x, y) coordinates or None if prediction fails
|
|
1399
|
+
"""
|
|
1400
|
+
# Get image dimensions from base64 data
|
|
1401
|
+
try:
|
|
1402
|
+
import base64
|
|
1403
|
+
from PIL import Image
|
|
1404
|
+
from io import BytesIO
|
|
1405
|
+
|
|
1406
|
+
image_data = base64.b64decode(image_b64)
|
|
1407
|
+
image = Image.open(BytesIO(image_data))
|
|
1408
|
+
display_width, display_height = image.size
|
|
1409
|
+
except Exception:
|
|
1410
|
+
# Fallback to default dimensions if image parsing fails
|
|
1411
|
+
display_width, display_height = 1024, 768
|
|
1412
|
+
|
|
1413
|
+
# Get tool configuration for this model
|
|
1414
|
+
tool_config = _get_tool_config_for_model(model)
|
|
1415
|
+
|
|
1416
|
+
# Prepare computer tool for Anthropic format
|
|
1417
|
+
computer_tool = {
|
|
1418
|
+
"type": tool_config["tool_version"],
|
|
1419
|
+
"function": {
|
|
1420
|
+
"name": "computer",
|
|
1421
|
+
"parameters": {
|
|
1422
|
+
"display_height_px": display_height,
|
|
1423
|
+
"display_width_px": display_width,
|
|
1424
|
+
"display_number": 1,
|
|
1425
|
+
},
|
|
1426
|
+
},
|
|
1427
|
+
}
|
|
1428
|
+
|
|
1429
|
+
# Construct messages in OpenAI chat completion format for liteLLM
|
|
1430
|
+
messages = [
|
|
1431
|
+
{
|
|
1432
|
+
"role": "user",
|
|
1433
|
+
"content": [
|
|
1434
|
+
{
|
|
1435
|
+
"type": "text",
|
|
1436
|
+
"text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
|
|
1437
|
+
},
|
|
1438
|
+
{
|
|
1439
|
+
"type": "image_url",
|
|
1440
|
+
"image_url": {
|
|
1441
|
+
"url": f"data:image/png;base64,{image_b64}"
|
|
1442
|
+
}
|
|
1443
|
+
}
|
|
1444
|
+
]
|
|
1445
|
+
}
|
|
1446
|
+
]
|
|
1447
|
+
|
|
1448
|
+
# Prepare API call kwargs
|
|
1449
|
+
api_kwargs = {
|
|
1450
|
+
"model": model,
|
|
1451
|
+
"messages": messages,
|
|
1452
|
+
"tools": [computer_tool],
|
|
1453
|
+
"stream": False,
|
|
1454
|
+
"max_tokens": 100, # Keep response short for click prediction
|
|
1455
|
+
"headers": {
|
|
1456
|
+
"anthropic-beta": tool_config["beta_flag"]
|
|
1457
|
+
}
|
|
1458
|
+
}
|
|
1459
|
+
|
|
1460
|
+
# Use liteLLM acompletion
|
|
1461
|
+
response = await litellm.acompletion(**api_kwargs)
|
|
1462
|
+
|
|
1463
|
+
# Convert response to responses_items format to extract click coordinates
|
|
1464
|
+
responses_items = _convert_completion_to_responses_items(response)
|
|
1465
|
+
|
|
1466
|
+
# Look for computer_call with click action
|
|
1467
|
+
for item in responses_items:
|
|
1468
|
+
if (isinstance(item, dict) and
|
|
1469
|
+
item.get("type") == "computer_call" and
|
|
1470
|
+
isinstance(item.get("action"), dict)):
|
|
1471
|
+
|
|
1472
|
+
action = item["action"]
|
|
1473
|
+
if action.get("type") == "click":
|
|
1474
|
+
x = action.get("x")
|
|
1475
|
+
y = action.get("y")
|
|
1476
|
+
if x is not None and y is not None:
|
|
1477
|
+
return (int(x), int(y))
|
|
1478
|
+
|
|
1479
|
+
return None
|
|
1366
1480
|
|
|
1367
|
-
|
|
1481
|
+
def get_capabilities(self) -> List[AgentCapability]:
|
|
1482
|
+
"""Return the capabilities supported by this agent."""
|
|
1483
|
+
return ["click", "step"]
|
agent/loops/base.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base protocol for async agent configurations
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
|
|
6
|
+
from abc import abstractmethod
|
|
7
|
+
from ..types import AgentCapability
|
|
8
|
+
|
|
9
|
+
class AsyncAgentConfig(Protocol):
|
|
10
|
+
"""Protocol defining the interface for async agent configurations."""
|
|
11
|
+
|
|
12
|
+
@abstractmethod
|
|
13
|
+
async def predict_step(
|
|
14
|
+
self,
|
|
15
|
+
messages: List[Dict[str, Any]],
|
|
16
|
+
model: str,
|
|
17
|
+
tools: Optional[List[Dict[str, Any]]] = None,
|
|
18
|
+
max_retries: Optional[int] = None,
|
|
19
|
+
stream: bool = False,
|
|
20
|
+
computer_handler=None,
|
|
21
|
+
_on_api_start=None,
|
|
22
|
+
_on_api_end=None,
|
|
23
|
+
_on_usage=None,
|
|
24
|
+
_on_screenshot=None,
|
|
25
|
+
**kwargs
|
|
26
|
+
) -> Dict[str, Any]:
|
|
27
|
+
"""
|
|
28
|
+
Predict the next step based on input items.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
messages: Input items following Responses format (message, function_call, computer_call)
|
|
32
|
+
model: Model name to use
|
|
33
|
+
tools: Optional list of tool schemas
|
|
34
|
+
max_retries: Maximum number of retries for failed API calls
|
|
35
|
+
stream: Whether to stream responses
|
|
36
|
+
computer_handler: Computer handler instance
|
|
37
|
+
_on_api_start: Callback for API start
|
|
38
|
+
_on_api_end: Callback for API end
|
|
39
|
+
_on_usage: Callback for usage tracking
|
|
40
|
+
_on_screenshot: Callback for screenshot events
|
|
41
|
+
**kwargs: Additional arguments
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
Dictionary with "output" (output items) and "usage" array
|
|
45
|
+
"""
|
|
46
|
+
...
|
|
47
|
+
|
|
48
|
+
@abstractmethod
|
|
49
|
+
async def predict_click(
|
|
50
|
+
self,
|
|
51
|
+
model: str,
|
|
52
|
+
image_b64: str,
|
|
53
|
+
instruction: str
|
|
54
|
+
) -> Optional[Tuple[int, int]]:
|
|
55
|
+
"""
|
|
56
|
+
Predict click coordinates based on image and instruction.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
model: Model name to use
|
|
60
|
+
image_b64: Base64 encoded image
|
|
61
|
+
instruction: Instruction for where to click
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
None or tuple with (x, y) coordinates
|
|
65
|
+
"""
|
|
66
|
+
...
|
|
67
|
+
|
|
68
|
+
@abstractmethod
|
|
69
|
+
def get_capabilities(self) -> List[AgentCapability]:
|
|
70
|
+
"""
|
|
71
|
+
Get list of capabilities supported by this agent config.
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
List of capability strings (e.g., ["step", "click"])
|
|
75
|
+
"""
|
|
76
|
+
...
|