cua-agent 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

This version of cua-agent has been flagged as potentially problematic.

agent/loops/anthropic.py CHANGED
@@ -4,12 +4,13 @@ Anthropic hosted tools agent loop implementation using liteLLM
 
 import asyncio
 import json
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
 
-from ..decorators import agent_loop
-from ..types import Messages, AgentResponse, Tools
+from ..decorators import register_agent
+from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..loops.base import AsyncAgentConfig
 from ..responses import (
     make_reasoning_item,
     make_output_text_item,
@@ -64,21 +65,28 @@ def _get_tool_config_for_model(model: str) -> Dict[str, str]:
         "beta_flag": "computer-use-2024-10-22"
     }
 
-def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
+async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
     """Map a computer tool to Anthropic's hosted tool schema."""
+    # Get dimensions from the computer handler
+    try:
+        width, height = await computer_tool.get_dimensions()
+    except Exception:
+        # Fallback to default dimensions if method fails
+        width, height = 1024, 768
+
     return {
         "type": tool_version,
         "function": {
             "name": "computer",
             "parameters": {
-                "display_height_px": getattr(computer_tool, 'display_height', 768),
-                "display_width_px": getattr(computer_tool, 'display_width', 1024),
-                "display_number": getattr(computer_tool, 'display_number', 1),
+                "display_height_px": height,
+                "display_width_px": width,
+                "display_number": 1,
             },
         },
     }
 
-def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
+async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
     """Prepare tools for Anthropic API format."""
     tool_config = _get_tool_config_for_model(model)
     anthropic_tools = []
@@ -86,7 +94,7 @@ def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str)
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to Anthropic format
-            anthropic_tools.append(_map_computer_tool_to_anthropic(
+            anthropic_tools.append(await _map_computer_tool_to_anthropic(
                 schema["computer"],
                 tool_config["tool_version"]
             ))
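
The two hunks above make tool mapping asynchronous: display dimensions are now queried from the computer handler at call time via `await computer_tool.get_dimensions()` rather than read from static attributes. A minimal sketch of the new behavior, assuming a hypothetical `StubComputer` handler and an illustrative tool-version string (only the async `get_dimensions()` contract and the 1024x768 fallback are taken from the diff):

```python
import asyncio
from typing import Any, Dict, Tuple


class StubComputer:
    """Hypothetical computer handler; only get_dimensions() is implied by the diff."""

    async def get_dimensions(self) -> Tuple[int, int]:
        # (width, height) of the screen in pixels.
        return (1920, 1080)


async def map_tool(computer: Any, tool_version: str) -> Dict[str, Any]:
    # Mirrors the new logic: query live dimensions, fall back to 1024x768.
    try:
        width, height = await computer.get_dimensions()
    except Exception:
        width, height = 1024, 768
    return {
        "type": tool_version,
        "function": {
            "name": "computer",
            "parameters": {
                "display_height_px": height,
                "display_width_px": width,
                "display_number": 1,
            },
        },
    }


print(asyncio.run(map_tool(StubComputer(), "computer_20241022")))
```

Because the helper is now a coroutine, `_prepare_tools_for_anthropic` must also be async, which is why every caller in the diff gains an `await`.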
@@ -1284,84 +1292,192 @@ def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str
 
     return merged
 
-@agent_loop(models=r".*claude-.*", priority=5)
-async def anthropic_hosted_tools_loop(
-    messages: Messages,
-    model: str,
-    tools: Optional[List[Dict[str, Any]]] = None,
-    max_retries: Optional[int] = None,
-    stream: bool = False,
-    computer_handler=None,
-    use_prompt_caching: Optional[bool] = False,
-    _on_api_start=None,
-    _on_api_end=None,
-    _on_usage=None,
-    _on_screenshot=None,
-    **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
-    """
-    Anthropic hosted tools agent loop using liteLLM acompletion.
-
-    Supports Anthropic's computer use models with hosted tools.
-    """
-    tools = tools or []
-
-    # Get tool configuration for this model
-    tool_config = _get_tool_config_for_model(model)
+@register_agent(models=r".*claude-.*")
+class AnthropicHostedToolsConfig(AsyncAgentConfig):
+    """Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
 
-    # Prepare tools for Anthropic API
-    anthropic_tools = _prepare_tools_for_anthropic(tools, model)
-
-    # Convert responses_items messages to completion format
-    completion_messages = _convert_responses_items_to_completion_messages(messages)
-    if use_prompt_caching:
-        # First combine messages to reduce number of blocks
-        completion_messages = _combine_completion_messages(completion_messages)
-        # Then add cache control, anthropic requires explicit "cache_control" dicts
-        completion_messages = _add_cache_control(completion_messages)
-
-    # Prepare API call kwargs
-    api_kwargs = {
-        "model": model,
-        "messages": completion_messages,
-        "tools": anthropic_tools if anthropic_tools else None,
-        "stream": stream,
-        "num_retries": max_retries,
+    async def predict_step(
+        self,
+        messages: Messages,
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
         **kwargs
-    }
-
-    # Add beta header for computer use
-    if anthropic_tools:
-        api_kwargs["headers"] = {
-            "anthropic-beta": tool_config["beta_flag"]
+    ) -> Dict[str, Any]:
+        """
+        Anthropic hosted tools agent loop using liteLLM acompletion.
+
+        Supports Anthropic's computer use models with hosted tools.
+        """
+        tools = tools or []
+
+        # Get tool configuration for this model
+        tool_config = _get_tool_config_for_model(model)
+
+        # Prepare tools for Anthropic API
+        anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
+
+        # Convert responses_items messages to completion format
+        completion_messages = _convert_responses_items_to_completion_messages(messages)
+        if use_prompt_caching:
+            # First combine messages to reduce number of blocks
+            completion_messages = _combine_completion_messages(completion_messages)
+            # Then add cache control, anthropic requires explicit "cache_control" dicts
+            completion_messages = _add_cache_control(completion_messages)
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": completion_messages,
+            "tools": anthropic_tools if anthropic_tools else None,
+            "stream": stream,
+            "num_retries": max_retries,
+            **kwargs
         }
-
-    # Call API start hook
-    if _on_api_start:
-        await _on_api_start(api_kwargs)
-
-    # Use liteLLM acompletion
-    response = await litellm.acompletion(**api_kwargs)
-
-    # Call API end hook
-    if _on_api_end:
-        await _on_api_end(api_kwargs, response)
-
-    # Convert response to responses_items format
-    responses_items = _convert_completion_to_responses_items(response)
+
+        # Add beta header for computer use
+        if anthropic_tools:
+            api_kwargs["headers"] = {
+                "anthropic-beta": tool_config["beta_flag"]
+            }
+
+        # Call API start hook
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Call API end hook
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        # Convert response to responses_items format
+        responses_items = _convert_completion_to_responses_items(response)
 
-    # Extract usage information
-    responses_usage = {
-        **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
-        "response_cost": response._hidden_params.get("response_cost", 0.0),
-    }
-    if _on_usage:
-        await _on_usage(responses_usage)
+        # Extract usage information
+        responses_usage = {
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            "response_cost": response._hidden_params.get("response_cost", 0.0),
+        }
+        if _on_usage:
+            await _on_usage(responses_usage)
 
-    # Create agent response
-    agent_response = {
-        "output": responses_items,
-        "usage": responses_usage
-    }
+        # Return in AsyncAgentConfig format
+        return {
+            "output": responses_items,
+            "usage": responses_usage
+        }
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str,
+        **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates based on image and instruction.
+
+        Uses Anthropic's computer use models with a custom prompt that instructs
+        the agent to only output clicks.
+
+        Args:
+            model: Model name to use
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Get image dimensions from base64 data
+        try:
+            import base64
+            from PIL import Image
+            from io import BytesIO
+
+            image_data = base64.b64decode(image_b64)
+            image = Image.open(BytesIO(image_data))
+            display_width, display_height = image.size
+        except Exception:
+            # Fallback to default dimensions if image parsing fails
+            display_width, display_height = 1024, 768
+
+        # Get tool configuration for this model
+        tool_config = _get_tool_config_for_model(model)
+
+        # Prepare computer tool for Anthropic format
+        computer_tool = {
+            "type": tool_config["tool_version"],
+            "function": {
+                "name": "computer",
+                "parameters": {
+                    "display_height_px": display_height,
+                    "display_width_px": display_width,
+                    "display_number": 1,
+                },
+            },
+        }
+
+        # Construct messages in OpenAI chat completion format for liteLLM
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{image_b64}"
+                        }
+                    }
+                ]
+            }
+        ]
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": messages,
+            "tools": [computer_tool],
+            "stream": False,
+            "max_tokens": 100,  # Keep response short for click prediction
+            "headers": {
+                "anthropic-beta": tool_config["beta_flag"]
+            }
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Convert response to responses_items format to extract click coordinates
+        responses_items = _convert_completion_to_responses_items(response)
+
+        # Look for computer_call with click action
+        for item in responses_items:
+            if (isinstance(item, dict) and
+                item.get("type") == "computer_call" and
+                isinstance(item.get("action"), dict)):
+
+                action = item["action"]
+                if action.get("type") == "click":
+                    x = action.get("x")
+                    y = action.get("y")
+                    if x is not None and y is not None:
+                        return (int(x), int(y))
+
+        return None
 
-    return agent_response
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click", "step"]
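
Taken together, this hunk replaces the `anthropic_hosted_tools_loop` coroutine with a class exposing `predict_step`, `predict_click`, and `get_capabilities`. A minimal sketch of driving the new `predict_click` directly, assuming the class can be imported and instantiated standalone, that a screenshot file exists on disk, and that liteLLM can reach Anthropic (e.g. `ANTHROPIC_API_KEY` is set); the real entry point is presumably the package's agent runtime:

```python
import asyncio
import base64

# Import path inferred from the file layout in this diff.
from agent.loops.anthropic import AnthropicHostedToolsConfig


async def main() -> None:
    # Hypothetical screenshot file; predict_click infers the display size
    # from the image itself, falling back to 1024x768 if parsing fails.
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("ascii")

    config = AnthropicHostedToolsConfig()
    coords = await config.predict_click(
        model="claude-3-5-sonnet-20241022",  # any name matching r".*claude-.*"
        image_b64=image_b64,
        instruction="click the Submit button",
    )
    # (x, y) in pixels, or None if the model returned no click action.
    print(coords)


asyncio.run(main())
```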
agent/loops/base.py ADDED
@@ -0,0 +1,76 @@
+"""
+Base protocol for async agent configurations
+"""
+
+from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
+from abc import abstractmethod
+from ..types import AgentCapability
+
+class AsyncAgentConfig(Protocol):
+    """Protocol defining the interface for async agent configurations."""
+
+    @abstractmethod
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Predict the next step based on input items.
+
+        Args:
+            messages: Input items following Responses format (message, function_call, computer_call)
+            model: Model name to use
+            tools: Optional list of tool schemas
+            max_retries: Maximum number of retries for failed API calls
+            stream: Whether to stream responses
+            computer_handler: Computer handler instance
+            _on_api_start: Callback for API start
+            _on_api_end: Callback for API end
+            _on_usage: Callback for usage tracking
+            _on_screenshot: Callback for screenshot events
+            **kwargs: Additional arguments
+
+        Returns:
+            Dictionary with "output" (output items) and "usage" array
+        """
+        ...
+
+    @abstractmethod
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates based on image and instruction.
+
+        Args:
+            model: Model name to use
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            None or tuple with (x, y) coordinates
+        """
+        ...
+
+    @abstractmethod
+    def get_capabilities(self) -> List[AgentCapability]:
+        """
+        Get list of capabilities supported by this agent config.
+
+        Returns:
+            List of capability strings (e.g., ["step", "click"])
+        """
+        ...
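
For reference, a skeleton of how a third-party loop might satisfy this protocol and register itself. Import paths are inferred from the file layout in this diff, the `models=` regex keyword follows the `@register_agent(models=r".*claude-.*")` usage in anthropic.py, and the echo behavior and model pattern are purely illustrative:

```python
from typing import Any, Dict, List, Optional, Tuple

from agent.decorators import register_agent
from agent.loops.base import AsyncAgentConfig
from agent.types import AgentCapability


@register_agent(models=r".*echo-model.*")  # hypothetical model-name pattern
class EchoAgentConfig(AsyncAgentConfig):
    """Toy config that returns a canned message instead of calling an LLM."""

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        # Output items follow the Responses format described in the docstrings above.
        return {
            "output": [{
                "type": "message",
                "role": "assistant",
                "content": [{"type": "output_text", "text": "hello"}],
            }],
            "usage": {},
        }

    async def predict_click(
        self, model: str, image_b64: str, instruction: str
    ) -> Optional[Tuple[int, int]]:
        # A real grounding implementation would return pixel coordinates here.
        return None

    def get_capabilities(self) -> List[AgentCapability]:
        return ["step"]
```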