cua-agent 0.4.7__tar.gz → 0.4.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- {cua_agent-0.4.7 → cua_agent-0.4.8}/PKG-INFO +2 -2
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/__init__.py +2 -2
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/adapters/huggingfacelocal_adapter.py +5 -1
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/agent.py +82 -15
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/cli.py +9 -3
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/computer_handler.py +3 -1
- cua_agent-0.4.8/agent/decorators.py +52 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/__init__.py +3 -1
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/anthropic.py +200 -84
- cua_agent-0.4.8/agent/loops/base.py +76 -0
- cua_agent-0.4.8/agent/loops/composed_grounded.py +318 -0
- cua_agent-0.4.8/agent/loops/gta1.py +178 -0
- cua_agent-0.4.8/agent/loops/model_types.csv +6 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/omniparser.py +178 -84
- cua_agent-0.4.8/agent/loops/openai.py +235 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/loops/uitars.py +305 -178
- cua_agent-0.4.8/agent/responses.py +683 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/types.py +7 -5
- {cua_agent-0.4.7 → cua_agent-0.4.8}/pyproject.toml +2 -2
- cua_agent-0.4.7/agent/decorators.py +0 -90
- cua_agent-0.4.7/agent/loops/openai.py +0 -95
- cua_agent-0.4.7/agent/responses.py +0 -207
- {cua_agent-0.4.7 → cua_agent-0.4.8}/README.md +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/__main__.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/adapters/__init__.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/__init__.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/base.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/budget_manager.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/image_retention.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/logging.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/pii_anonymization.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/telemetry.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/callbacks/trajectory_saver.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/telemetry.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/__init__.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/__main__.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/gradio/__init__.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/gradio/app.py +0 -0
- {cua_agent-0.4.7 → cua_agent-0.4.8}/agent/ui/gradio/ui_components.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cua-agent
|
|
3
|
-
Version: 0.4.7
|
|
3
|
+
Version: 0.4.8
|
|
4
4
|
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
5
|
Author-Email: TryCua <gh@trycua.com>
|
|
6
6
|
Requires-Python: >=3.11
|
|
@@ -15,7 +15,7 @@ Requires-Dist: python-dotenv>=1.0.1
|
|
|
15
15
|
Requires-Dist: cua-computer<0.5.0,>=0.4.0
|
|
16
16
|
Requires-Dist: cua-core<0.2.0,>=0.1.8
|
|
17
17
|
Requires-Dist: certifi>=2024.2.2
|
|
18
|
-
Requires-Dist: litellm>=1.74.
|
|
18
|
+
Requires-Dist: litellm>=1.74.12
|
|
19
19
|
Provides-Extra: openai
|
|
20
20
|
Provides-Extra: anthropic
|
|
21
21
|
Provides-Extra: omni
|
|
@@ -5,7 +5,7 @@ agent - Decorator-based Computer Use Agent with liteLLM integration
|
|
|
5
5
|
import logging
|
|
6
6
|
import sys
|
|
7
7
|
|
|
8
|
-
from .decorators import agent_loop
|
|
8
|
+
from .decorators import register_agent
|
|
9
9
|
from .agent import ComputerAgent
|
|
10
10
|
from .types import Messages, AgentResponse
|
|
11
11
|
|
|
@@ -13,7 +13,7 @@ from .types import Messages, AgentResponse
|
|
|
13
13
|
from . import loops
|
|
14
14
|
|
|
15
15
|
__all__ = [
|
|
16
|
-
"agent_loop",
|
|
16
|
+
"register_agent",
|
|
17
17
|
"ComputerAgent",
|
|
18
18
|
"Messages",
|
|
19
19
|
"AgentResponse"
|
|
@@ -48,7 +48,11 @@ class HuggingFaceLocalAdapter(CustomLLM):
|
|
|
48
48
|
)
|
|
49
49
|
|
|
50
50
|
# Load processor
|
|
51
|
-
processor = AutoProcessor.from_pretrained(
|
|
51
|
+
processor = AutoProcessor.from_pretrained(
|
|
52
|
+
model_name,
|
|
53
|
+
min_pixels=3136,
|
|
54
|
+
max_pixels=4096 * 2160
|
|
55
|
+
)
|
|
52
56
|
|
|
53
57
|
# Cache them
|
|
54
58
|
self.models[model_name] = model
|
|
@@ -3,12 +3,12 @@ ComputerAgent - Main agent class that selects and runs agent loops
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
-
from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set
|
|
6
|
+
from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple
|
|
7
7
|
|
|
8
8
|
from litellm.responses.utils import Usage
|
|
9
9
|
|
|
10
|
-
from .types import Messages, Computer
|
|
11
|
-
from .decorators import
|
|
10
|
+
from .types import Messages, Computer, AgentCapability
|
|
11
|
+
from .decorators import find_agent_config
|
|
12
12
|
from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url
|
|
13
13
|
import json
|
|
14
14
|
import litellm
|
|
@@ -117,6 +117,13 @@ def sanitize_message(msg: Any) -> Any:
|
|
|
117
117
|
return sanitized
|
|
118
118
|
return msg
|
|
119
119
|
|
|
120
|
+
def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
|
|
121
|
+
call_ids = []
|
|
122
|
+
for message in messages:
|
|
123
|
+
if message.get("type") == "computer_call_output" or message.get("type") == "function_call_output":
|
|
124
|
+
call_ids.append(message.get("call_id"))
|
|
125
|
+
return call_ids
|
|
126
|
+
|
|
120
127
|
class ComputerAgent:
|
|
121
128
|
"""
|
|
122
129
|
Main agent class that automatically selects the appropriate agent loop
|
|
@@ -207,19 +214,21 @@ class ComputerAgent:
|
|
|
207
214
|
litellm.custom_provider_map = [
|
|
208
215
|
{"provider": "huggingface-local", "custom_handler": hf_adapter}
|
|
209
216
|
]
|
|
217
|
+
litellm.suppress_debug_info = True
|
|
210
218
|
|
|
211
219
|
# == Initialize computer agent ==
|
|
212
220
|
|
|
213
221
|
# Find the appropriate agent loop
|
|
214
222
|
if custom_loop:
|
|
215
223
|
self.agent_loop = custom_loop
|
|
216
|
-
self.
|
|
224
|
+
self.agent_config_info = None
|
|
217
225
|
else:
|
|
218
|
-
|
|
219
|
-
if not
|
|
220
|
-
raise ValueError(f"No agent
|
|
221
|
-
|
|
222
|
-
self.
|
|
226
|
+
config_info = find_agent_config(model)
|
|
227
|
+
if not config_info:
|
|
228
|
+
raise ValueError(f"No agent config found for model: {model}")
|
|
229
|
+
# Instantiate the agent config class
|
|
230
|
+
self.agent_loop = config_info.agent_class()
|
|
231
|
+
self.agent_config_info = config_info
|
|
223
232
|
|
|
224
233
|
self.tool_schemas = []
|
|
225
234
|
self.computer_handler = None
|
|
@@ -389,8 +398,10 @@ class ComputerAgent:
|
|
|
389
398
|
# AGENT OUTPUT PROCESSING
|
|
390
399
|
# ============================================================================
|
|
391
400
|
|
|
392
|
-
async def _handle_item(self, item: Any, computer: Optional[Computer] = None) -> List[Dict[str, Any]]:
|
|
401
|
+
async def _handle_item(self, item: Any, computer: Optional[Computer] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
393
402
|
"""Handle each item; may cause a computer action + screenshot."""
|
|
403
|
+
if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids:
|
|
404
|
+
return []
|
|
394
405
|
|
|
395
406
|
item_type = item.get("type", None)
|
|
396
407
|
|
|
@@ -439,7 +450,7 @@ class ComputerAgent:
|
|
|
439
450
|
acknowledged_checks = []
|
|
440
451
|
for check in pending_checks:
|
|
441
452
|
check_message = check.get("message", str(check))
|
|
442
|
-
if acknowledge_safety_check_callback(check_message):
|
|
453
|
+
if acknowledge_safety_check_callback(check_message, allow_always=True): # TODO: implement a callback for safety checks
|
|
443
454
|
acknowledged_checks.append(check)
|
|
444
455
|
else:
|
|
445
456
|
raise ValueError(f"Safety check failed: {check_message}")
|
|
@@ -514,6 +525,12 @@ class ComputerAgent:
|
|
|
514
525
|
Returns:
|
|
515
526
|
AsyncGenerator that yields response chunks
|
|
516
527
|
"""
|
|
528
|
+
if not self.agent_config_info:
|
|
529
|
+
raise ValueError("Agent configuration not found")
|
|
530
|
+
|
|
531
|
+
capabilities = self.get_capabilities()
|
|
532
|
+
if "step" not in capabilities:
|
|
533
|
+
raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")
|
|
517
534
|
|
|
518
535
|
await self._initialize_computers()
|
|
519
536
|
|
|
@@ -528,7 +545,7 @@ class ComputerAgent:
|
|
|
528
545
|
"messages": messages,
|
|
529
546
|
"stream": stream,
|
|
530
547
|
"model": self.model,
|
|
531
|
-
"agent_loop": self.
|
|
548
|
+
"agent_loop": self.agent_config_info.agent_class.__name__,
|
|
532
549
|
**merged_kwargs
|
|
533
550
|
}
|
|
534
551
|
await self._on_run_start(run_kwargs, old_items)
|
|
@@ -558,7 +575,7 @@ class ComputerAgent:
|
|
|
558
575
|
}
|
|
559
576
|
|
|
560
577
|
# Run agent loop iteration
|
|
561
|
-
result = await self.agent_loop(
|
|
578
|
+
result = await self.agent_loop.predict_step(
|
|
562
579
|
**loop_kwargs,
|
|
563
580
|
_on_api_start=self._on_api_start,
|
|
564
581
|
_on_api_end=self._on_api_end,
|
|
@@ -579,9 +596,12 @@ class ComputerAgent:
|
|
|
579
596
|
# Add agent response to new_items
|
|
580
597
|
new_items += result.get("output")
|
|
581
598
|
|
|
599
|
+
# Get output call ids
|
|
600
|
+
output_call_ids = get_output_call_ids(result.get("output", []))
|
|
601
|
+
|
|
582
602
|
# Handle computer actions
|
|
583
603
|
for item in result.get("output"):
|
|
584
|
-
partial_items = await self._handle_item(item, self.computer_handler)
|
|
604
|
+
partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
|
|
585
605
|
new_items += partial_items
|
|
586
606
|
|
|
587
607
|
# Yield partial response
|
|
@@ -594,4 +614,51 @@ class ComputerAgent:
|
|
|
594
614
|
)
|
|
595
615
|
}
|
|
596
616
|
|
|
597
|
-
await self._on_run_end(loop_kwargs, old_items, new_items)
|
|
617
|
+
await self._on_run_end(loop_kwargs, old_items, new_items)
|
|
618
|
+
|
|
619
|
+
async def predict_click(
|
|
620
|
+
self,
|
|
621
|
+
instruction: str,
|
|
622
|
+
image_b64: Optional[str] = None
|
|
623
|
+
) -> Optional[Tuple[int, int]]:
|
|
624
|
+
"""
|
|
625
|
+
Predict click coordinates based on image and instruction.
|
|
626
|
+
|
|
627
|
+
Args:
|
|
628
|
+
instruction: Instruction for where to click
|
|
629
|
+
image_b64: Base64 encoded image (optional, will take screenshot if not provided)
|
|
630
|
+
|
|
631
|
+
Returns:
|
|
632
|
+
None or tuple with (x, y) coordinates
|
|
633
|
+
"""
|
|
634
|
+
if not self.agent_config_info:
|
|
635
|
+
raise ValueError("Agent configuration not found")
|
|
636
|
+
|
|
637
|
+
capabilities = self.get_capabilities()
|
|
638
|
+
if "click" not in capabilities:
|
|
639
|
+
raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions")
|
|
640
|
+
if hasattr(self.agent_loop, 'predict_click'):
|
|
641
|
+
if not image_b64:
|
|
642
|
+
if not self.computer_handler:
|
|
643
|
+
raise ValueError("Computer tool or image_b64 is required for predict_click")
|
|
644
|
+
image_b64 = await self.computer_handler.screenshot()
|
|
645
|
+
return await self.agent_loop.predict_click(
|
|
646
|
+
model=self.model,
|
|
647
|
+
image_b64=image_b64,
|
|
648
|
+
instruction=instruction
|
|
649
|
+
)
|
|
650
|
+
return None
|
|
651
|
+
|
|
652
|
+
def get_capabilities(self) -> List[AgentCapability]:
|
|
653
|
+
"""
|
|
654
|
+
Get list of capabilities supported by the current agent config.
|
|
655
|
+
|
|
656
|
+
Returns:
|
|
657
|
+
List of capability strings (e.g., ["step", "click"])
|
|
658
|
+
"""
|
|
659
|
+
if not self.agent_config_info:
|
|
660
|
+
raise ValueError("Agent configuration not found")
|
|
661
|
+
|
|
662
|
+
if hasattr(self.agent_loop, 'get_capabilities'):
|
|
663
|
+
return self.agent_loop.get_capabilities()
|
|
664
|
+
return ["step"] # Default capability
|
|
@@ -120,7 +120,7 @@ async def ainput(prompt: str = ""):
|
|
|
120
120
|
|
|
121
121
|
async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
|
|
122
122
|
"""Main chat loop with the agent."""
|
|
123
|
-
print_welcome(model, agent.
|
|
123
|
+
print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
|
|
124
124
|
|
|
125
125
|
history = []
|
|
126
126
|
|
|
@@ -130,7 +130,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
130
130
|
total_cost = 0
|
|
131
131
|
|
|
132
132
|
while True:
|
|
133
|
-
if history[-1].get("role") != "user":
|
|
133
|
+
if len(history) == 0 or history[-1].get("role") != "user":
|
|
134
134
|
# Get user input with prompt
|
|
135
135
|
print_colored("> ", end="")
|
|
136
136
|
user_input = await ainput()
|
|
@@ -260,7 +260,12 @@ Examples:
|
|
|
260
260
|
help="Show total cost of the agent runs"
|
|
261
261
|
)
|
|
262
262
|
|
|
263
|
-
|
|
263
|
+
parser.add_argument(
|
|
264
|
+
"-r", "--max-retries",
|
|
265
|
+
type=int,
|
|
266
|
+
default=3,
|
|
267
|
+
help="Maximum number of retries for the LLM API calls"
|
|
268
|
+
)
|
|
264
269
|
|
|
265
270
|
args = parser.parse_args()
|
|
266
271
|
|
|
@@ -327,6 +332,7 @@ Examples:
|
|
|
327
332
|
"model": args.model,
|
|
328
333
|
"tools": [computer],
|
|
329
334
|
"verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
|
|
335
|
+
"max_retries": args.max_retries
|
|
330
336
|
}
|
|
331
337
|
|
|
332
338
|
if args.images > 0:
|
|
@@ -93,8 +93,10 @@ class OpenAIComputerHandler:
|
|
|
93
93
|
return ""
|
|
94
94
|
|
|
95
95
|
|
|
96
|
-
def acknowledge_safety_check_callback(message: str) -> bool:
|
|
96
|
+
def acknowledge_safety_check_callback(message: str, allow_always: bool = False) -> bool:
|
|
97
97
|
"""Safety check callback for user acknowledgment."""
|
|
98
|
+
if allow_always:
|
|
99
|
+
return True
|
|
98
100
|
response = input(
|
|
99
101
|
f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
|
|
100
102
|
).lower()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Decorators for agent - agent_loop decorator
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
from .types import AgentConfigInfo
|
|
7
|
+
|
|
8
|
+
# Global registry
|
|
9
|
+
_agent_configs: List[AgentConfigInfo] = []
|
|
10
|
+
|
|
11
|
+
def register_agent(models: str, priority: int = 0):
|
|
12
|
+
"""
|
|
13
|
+
Decorator to register an AsyncAgentConfig class.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
models: Regex pattern to match supported models
|
|
17
|
+
priority: Priority for agent selection (higher = more priority)
|
|
18
|
+
"""
|
|
19
|
+
def decorator(agent_class: type):
|
|
20
|
+
# Validate that the class implements AsyncAgentConfig protocol
|
|
21
|
+
if not hasattr(agent_class, 'predict_step'):
|
|
22
|
+
raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
|
|
23
|
+
if not hasattr(agent_class, 'predict_click'):
|
|
24
|
+
raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
|
|
25
|
+
if not hasattr(agent_class, 'get_capabilities'):
|
|
26
|
+
raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")
|
|
27
|
+
|
|
28
|
+
# Register the agent config
|
|
29
|
+
config_info = AgentConfigInfo(
|
|
30
|
+
agent_class=agent_class,
|
|
31
|
+
models_regex=models,
|
|
32
|
+
priority=priority
|
|
33
|
+
)
|
|
34
|
+
_agent_configs.append(config_info)
|
|
35
|
+
|
|
36
|
+
# Sort by priority (highest first)
|
|
37
|
+
_agent_configs.sort(key=lambda x: x.priority, reverse=True)
|
|
38
|
+
|
|
39
|
+
return agent_class
|
|
40
|
+
|
|
41
|
+
return decorator
|
|
42
|
+
|
|
43
|
+
def get_agent_configs() -> List[AgentConfigInfo]:
|
|
44
|
+
"""Get all registered agent configs"""
|
|
45
|
+
return _agent_configs.copy()
|
|
46
|
+
|
|
47
|
+
def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
|
|
48
|
+
"""Find the best matching agent config for a model"""
|
|
49
|
+
for config_info in _agent_configs:
|
|
50
|
+
if config_info.matches_model(model):
|
|
51
|
+
return config_info
|
|
52
|
+
return None
|
|
@@ -7,5 +7,7 @@ from . import anthropic
|
|
|
7
7
|
from . import openai
|
|
8
8
|
from . import uitars
|
|
9
9
|
from . import omniparser
|
|
10
|
+
from . import gta1
|
|
11
|
+
from . import composed_grounded
|
|
10
12
|
|
|
11
|
-
__all__ = ["anthropic", "openai", "uitars", "omniparser"]
|
|
13
|
+
__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded"]
|
|
@@ -4,12 +4,13 @@ Anthropic hosted tools agent loop implementation using liteLLM
|
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import json
|
|
7
|
-
from typing import Dict, List, Any, AsyncGenerator, Union, Optional
|
|
7
|
+
from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
|
|
8
8
|
import litellm
|
|
9
9
|
from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
|
|
10
10
|
|
|
11
|
-
from ..decorators import
|
|
12
|
-
from ..types import Messages, AgentResponse, Tools
|
|
11
|
+
from ..decorators import register_agent
|
|
12
|
+
from ..types import Messages, AgentResponse, Tools, AgentCapability
|
|
13
|
+
from ..loops.base import AsyncAgentConfig
|
|
13
14
|
from ..responses import (
|
|
14
15
|
make_reasoning_item,
|
|
15
16
|
make_output_text_item,
|
|
@@ -64,21 +65,28 @@ def _get_tool_config_for_model(model: str) -> Dict[str, str]:
|
|
|
64
65
|
"beta_flag": "computer-use-2024-10-22"
|
|
65
66
|
}
|
|
66
67
|
|
|
67
|
-
def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
|
|
68
|
+
async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
|
|
68
69
|
"""Map a computer tool to Anthropic's hosted tool schema."""
|
|
70
|
+
# Get dimensions from the computer handler
|
|
71
|
+
try:
|
|
72
|
+
width, height = await computer_tool.get_dimensions()
|
|
73
|
+
except Exception:
|
|
74
|
+
# Fallback to default dimensions if method fails
|
|
75
|
+
width, height = 1024, 768
|
|
76
|
+
|
|
69
77
|
return {
|
|
70
78
|
"type": tool_version,
|
|
71
79
|
"function": {
|
|
72
80
|
"name": "computer",
|
|
73
81
|
"parameters": {
|
|
74
|
-
"display_height_px":
|
|
75
|
-
"display_width_px":
|
|
76
|
-
"display_number":
|
|
82
|
+
"display_height_px": height,
|
|
83
|
+
"display_width_px": width,
|
|
84
|
+
"display_number": 1,
|
|
77
85
|
},
|
|
78
86
|
},
|
|
79
87
|
}
|
|
80
88
|
|
|
81
|
-
def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
|
|
89
|
+
async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
|
|
82
90
|
"""Prepare tools for Anthropic API format."""
|
|
83
91
|
tool_config = _get_tool_config_for_model(model)
|
|
84
92
|
anthropic_tools = []
|
|
@@ -86,7 +94,7 @@ def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str)
|
|
|
86
94
|
for schema in tool_schemas:
|
|
87
95
|
if schema["type"] == "computer":
|
|
88
96
|
# Map computer tool to Anthropic format
|
|
89
|
-
anthropic_tools.append(_map_computer_tool_to_anthropic(
|
|
97
|
+
anthropic_tools.append(await _map_computer_tool_to_anthropic(
|
|
90
98
|
schema["computer"],
|
|
91
99
|
tool_config["tool_version"]
|
|
92
100
|
))
|
|
@@ -1284,84 +1292,192 @@ def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str
|
|
|
1284
1292
|
|
|
1285
1293
|
return merged
|
|
1286
1294
|
|
|
1287
|
-
@
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
model: str,
|
|
1291
|
-
tools: Optional[List[Dict[str, Any]]] = None,
|
|
1292
|
-
max_retries: Optional[int] = None,
|
|
1293
|
-
stream: bool = False,
|
|
1294
|
-
computer_handler=None,
|
|
1295
|
-
use_prompt_caching: Optional[bool] = False,
|
|
1296
|
-
_on_api_start=None,
|
|
1297
|
-
_on_api_end=None,
|
|
1298
|
-
_on_usage=None,
|
|
1299
|
-
_on_screenshot=None,
|
|
1300
|
-
**kwargs
|
|
1301
|
-
) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
|
|
1302
|
-
"""
|
|
1303
|
-
Anthropic hosted tools agent loop using liteLLM acompletion.
|
|
1304
|
-
|
|
1305
|
-
Supports Anthropic's computer use models with hosted tools.
|
|
1306
|
-
"""
|
|
1307
|
-
tools = tools or []
|
|
1308
|
-
|
|
1309
|
-
# Get tool configuration for this model
|
|
1310
|
-
tool_config = _get_tool_config_for_model(model)
|
|
1295
|
+
@register_agent(models=r".*claude-.*")
|
|
1296
|
+
class AnthropicHostedToolsConfig(AsyncAgentConfig):
|
|
1297
|
+
"""Anthropic hosted tools agent configuration implementing AsyncAgentConfig protocol."""
|
|
1311
1298
|
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
"model": model,
|
|
1326
|
-
"messages": completion_messages,
|
|
1327
|
-
"tools": anthropic_tools if anthropic_tools else None,
|
|
1328
|
-
"stream": stream,
|
|
1329
|
-
"num_retries": max_retries,
|
|
1299
|
+
async def predict_step(
|
|
1300
|
+
self,
|
|
1301
|
+
messages: Messages,
|
|
1302
|
+
model: str,
|
|
1303
|
+
tools: Optional[List[Dict[str, Any]]] = None,
|
|
1304
|
+
max_retries: Optional[int] = None,
|
|
1305
|
+
stream: bool = False,
|
|
1306
|
+
computer_handler=None,
|
|
1307
|
+
use_prompt_caching: Optional[bool] = False,
|
|
1308
|
+
_on_api_start=None,
|
|
1309
|
+
_on_api_end=None,
|
|
1310
|
+
_on_usage=None,
|
|
1311
|
+
_on_screenshot=None,
|
|
1330
1312
|
**kwargs
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1313
|
+
) -> Dict[str, Any]:
|
|
1314
|
+
"""
|
|
1315
|
+
Anthropic hosted tools agent loop using liteLLM acompletion.
|
|
1316
|
+
|
|
1317
|
+
Supports Anthropic's computer use models with hosted tools.
|
|
1318
|
+
"""
|
|
1319
|
+
tools = tools or []
|
|
1320
|
+
|
|
1321
|
+
# Get tool configuration for this model
|
|
1322
|
+
tool_config = _get_tool_config_for_model(model)
|
|
1323
|
+
|
|
1324
|
+
# Prepare tools for Anthropic API
|
|
1325
|
+
anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
|
|
1326
|
+
|
|
1327
|
+
# Convert responses_items messages to completion format
|
|
1328
|
+
completion_messages = _convert_responses_items_to_completion_messages(messages)
|
|
1329
|
+
if use_prompt_caching:
|
|
1330
|
+
# First combine messages to reduce number of blocks
|
|
1331
|
+
completion_messages = _combine_completion_messages(completion_messages)
|
|
1332
|
+
# Then add cache control, anthropic requires explicit "cache_control" dicts
|
|
1333
|
+
completion_messages = _add_cache_control(completion_messages)
|
|
1334
|
+
|
|
1335
|
+
# Prepare API call kwargs
|
|
1336
|
+
api_kwargs = {
|
|
1337
|
+
"model": model,
|
|
1338
|
+
"messages": completion_messages,
|
|
1339
|
+
"tools": anthropic_tools if anthropic_tools else None,
|
|
1340
|
+
"stream": stream,
|
|
1341
|
+
"num_retries": max_retries,
|
|
1342
|
+
**kwargs
|
|
1337
1343
|
}
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1344
|
+
|
|
1345
|
+
# Add beta header for computer use
|
|
1346
|
+
if anthropic_tools:
|
|
1347
|
+
api_kwargs["headers"] = {
|
|
1348
|
+
"anthropic-beta": tool_config["beta_flag"]
|
|
1349
|
+
}
|
|
1350
|
+
|
|
1351
|
+
# Call API start hook
|
|
1352
|
+
if _on_api_start:
|
|
1353
|
+
await _on_api_start(api_kwargs)
|
|
1354
|
+
|
|
1355
|
+
# Use liteLLM acompletion
|
|
1356
|
+
response = await litellm.acompletion(**api_kwargs)
|
|
1357
|
+
|
|
1358
|
+
# Call API end hook
|
|
1359
|
+
if _on_api_end:
|
|
1360
|
+
await _on_api_end(api_kwargs, response)
|
|
1361
|
+
|
|
1362
|
+
# Convert response to responses_items format
|
|
1363
|
+
responses_items = _convert_completion_to_responses_items(response)
|
|
1352
1364
|
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1365
|
+
# Extract usage information
|
|
1366
|
+
responses_usage = {
|
|
1367
|
+
**LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
|
|
1368
|
+
"response_cost": response._hidden_params.get("response_cost", 0.0),
|
|
1369
|
+
}
|
|
1370
|
+
if _on_usage:
|
|
1371
|
+
await _on_usage(responses_usage)
|
|
1360
1372
|
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1373
|
+
# Return in AsyncAgentConfig format
|
|
1374
|
+
return {
|
|
1375
|
+
"output": responses_items,
|
|
1376
|
+
"usage": responses_usage
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
async def predict_click(
|
|
1380
|
+
self,
|
|
1381
|
+
model: str,
|
|
1382
|
+
image_b64: str,
|
|
1383
|
+
instruction: str,
|
|
1384
|
+
**kwargs
|
|
1385
|
+
) -> Optional[Tuple[int, int]]:
|
|
1386
|
+
"""
|
|
1387
|
+
Predict click coordinates based on image and instruction.
|
|
1388
|
+
|
|
1389
|
+
Uses Anthropic's computer use models with a custom prompt that instructs
|
|
1390
|
+
the agent to only output clicks.
|
|
1391
|
+
|
|
1392
|
+
Args:
|
|
1393
|
+
model: Model name to use
|
|
1394
|
+
image_b64: Base64 encoded image
|
|
1395
|
+
instruction: Instruction for where to click
|
|
1396
|
+
|
|
1397
|
+
Returns:
|
|
1398
|
+
Tuple of (x, y) coordinates or None if prediction fails
|
|
1399
|
+
"""
|
|
1400
|
+
# Get image dimensions from base64 data
|
|
1401
|
+
try:
|
|
1402
|
+
import base64
|
|
1403
|
+
from PIL import Image
|
|
1404
|
+
from io import BytesIO
|
|
1405
|
+
|
|
1406
|
+
image_data = base64.b64decode(image_b64)
|
|
1407
|
+
image = Image.open(BytesIO(image_data))
|
|
1408
|
+
display_width, display_height = image.size
|
|
1409
|
+
except Exception:
|
|
1410
|
+
# Fallback to default dimensions if image parsing fails
|
|
1411
|
+
display_width, display_height = 1024, 768
|
|
1412
|
+
|
|
1413
|
+
# Get tool configuration for this model
|
|
1414
|
+
tool_config = _get_tool_config_for_model(model)
|
|
1415
|
+
|
|
1416
|
+
# Prepare computer tool for Anthropic format
|
|
1417
|
+
computer_tool = {
|
|
1418
|
+
"type": tool_config["tool_version"],
|
|
1419
|
+
"function": {
|
|
1420
|
+
"name": "computer",
|
|
1421
|
+
"parameters": {
|
|
1422
|
+
"display_height_px": display_height,
|
|
1423
|
+
"display_width_px": display_width,
|
|
1424
|
+
"display_number": 1,
|
|
1425
|
+
},
|
|
1426
|
+
},
|
|
1427
|
+
}
|
|
1428
|
+
|
|
1429
|
+
# Construct messages in OpenAI chat completion format for liteLLM
|
|
1430
|
+
messages = [
|
|
1431
|
+
{
|
|
1432
|
+
"role": "user",
|
|
1433
|
+
"content": [
|
|
1434
|
+
{
|
|
1435
|
+
"type": "text",
|
|
1436
|
+
"text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
|
|
1437
|
+
},
|
|
1438
|
+
{
|
|
1439
|
+
"type": "image_url",
|
|
1440
|
+
"image_url": {
|
|
1441
|
+
"url": f"data:image/png;base64,{image_b64}"
|
|
1442
|
+
}
|
|
1443
|
+
}
|
|
1444
|
+
]
|
|
1445
|
+
}
|
|
1446
|
+
]
|
|
1447
|
+
|
|
1448
|
+
# Prepare API call kwargs
|
|
1449
|
+
api_kwargs = {
|
|
1450
|
+
"model": model,
|
|
1451
|
+
"messages": messages,
|
|
1452
|
+
"tools": [computer_tool],
|
|
1453
|
+
"stream": False,
|
|
1454
|
+
"max_tokens": 100, # Keep response short for click prediction
|
|
1455
|
+
"headers": {
|
|
1456
|
+
"anthropic-beta": tool_config["beta_flag"]
|
|
1457
|
+
}
|
|
1458
|
+
}
|
|
1459
|
+
|
|
1460
|
+
# Use liteLLM acompletion
|
|
1461
|
+
response = await litellm.acompletion(**api_kwargs)
|
|
1462
|
+
|
|
1463
|
+
# Convert response to responses_items format to extract click coordinates
|
|
1464
|
+
responses_items = _convert_completion_to_responses_items(response)
|
|
1465
|
+
|
|
1466
|
+
# Look for computer_call with click action
|
|
1467
|
+
for item in responses_items:
|
|
1468
|
+
if (isinstance(item, dict) and
|
|
1469
|
+
item.get("type") == "computer_call" and
|
|
1470
|
+
isinstance(item.get("action"), dict)):
|
|
1471
|
+
|
|
1472
|
+
action = item["action"]
|
|
1473
|
+
if action.get("type") == "click":
|
|
1474
|
+
x = action.get("x")
|
|
1475
|
+
y = action.get("y")
|
|
1476
|
+
if x is not None and y is not None:
|
|
1477
|
+
return (int(x), int(y))
|
|
1478
|
+
|
|
1479
|
+
return None
|
|
1366
1480
|
|
|
1367
|
-
|
|
1481
|
+
def get_capabilities(self) -> List[AgentCapability]:
|
|
1482
|
+
"""Return the capabilities supported by this agent."""
|
|
1483
|
+
return ["click", "step"]
|