cua-agent 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +2 -2
- agent/adapters/huggingfacelocal_adapter.py +8 -5
- agent/agent.py +85 -15
- agent/cli.py +9 -3
- agent/computer_handler.py +3 -1
- agent/decorators.py +28 -66
- agent/loops/__init__.py +3 -1
- agent/loops/anthropic.py +200 -84
- agent/loops/base.py +76 -0
- agent/loops/composed_grounded.py +318 -0
- agent/loops/gta1.py +178 -0
- agent/loops/model_types.csv +6 -0
- agent/loops/omniparser.py +178 -84
- agent/loops/openai.py +198 -58
- agent/loops/uitars.py +305 -178
- agent/responses.py +477 -1
- agent/types.py +7 -5
- agent/ui/gradio/app.py +14 -7
- agent/ui/gradio/ui_components.py +18 -1
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/METADATA +3 -3
- cua_agent-0.4.8.dist-info/RECORD +37 -0
- cua_agent-0.4.6.dist-info/RECORD +0 -33
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.6.dist-info → cua_agent-0.4.8.dist-info}/entry_points.txt +0 -0
agent/__init__.py
CHANGED
@@ -5,7 +5,7 @@ agent - Decorator-based Computer Use Agent with liteLLM integration
 import logging
 import sys
 
-from .decorators import
+from .decorators import register_agent
 from .agent import ComputerAgent
 from .types import Messages, AgentResponse
 
@@ -13,7 +13,7 @@ from .types import Messages, AgentResponse
 from . import loops
 
 __all__ = [
-    "
+    "register_agent",
     "ComputerAgent",
    "Messages",
    "AgentResponse"
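The top-level package now re-exports the new register_agent decorator in place of the old loop decorator. A minimal import sketch, assuming the wheel installs the import package as `agent` (as listed in the RECORD):

```python
# Sketch only: the import package name `agent` is taken from the wheel's RECORD entries,
# and these four names are exactly the ones listed in the new __all__.
from agent import register_agent, ComputerAgent, Messages, AgentResponse
```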
agent/adapters/huggingfacelocal_adapter.py
CHANGED
@@ -8,7 +8,7 @@ from litellm import completion, acompletion
 # Try to import HuggingFace dependencies
 try:
     import torch
-    from transformers import
+    from transformers import AutoModelForImageTextToText, AutoProcessor
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
@@ -40,7 +40,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
         """
         if model_name not in self.models:
             # Load model
-            model =
+            model = AutoModelForImageTextToText.from_pretrained(
                 model_name,
                 torch_dtype=torch.float16,
                 device_map=self.device,
@@ -48,7 +48,11 @@ class HuggingFaceLocalAdapter(CustomLLM):
             )
 
             # Load processor
-            processor = AutoProcessor.from_pretrained(
+            processor = AutoProcessor.from_pretrained(
+                model_name,
+                min_pixels=3136,
+                max_pixels=4096 * 2160
+            )
 
             # Cache them
             self.models[model_name] = model
@@ -141,8 +145,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
         )
 
         # Move inputs to the same device as model
-
-        inputs = inputs.to("cuda")
+        inputs = inputs.to(model.device)
 
         # Generate response
         with torch.no_grad():
agent/agent.py
CHANGED
@@ -3,12 +3,12 @@ ComputerAgent - Main agent class that selects and runs agent loops
 """
 
 import asyncio
-from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set
+from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple
 
 from litellm.responses.utils import Usage
 
-from .types import Messages, Computer
-from .decorators import
+from .types import Messages, Computer, AgentCapability
+from .decorators import find_agent_config
 from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url
 import json
 import litellm
@@ -117,6 +117,13 @@ def sanitize_message(msg: Any) -> Any:
         return sanitized
     return msg
 
+def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
+    call_ids = []
+    for message in messages:
+        if message.get("type") == "computer_call_output" or message.get("type") == "function_call_output":
+            call_ids.append(message.get("call_id"))
+    return call_ids
+
 class ComputerAgent:
     """
     Main agent class that automatically selects the appropriate agent loop
@@ -207,19 +214,21 @@ class ComputerAgent:
             litellm.custom_provider_map = [
                 {"provider": "huggingface-local", "custom_handler": hf_adapter}
             ]
+            litellm.suppress_debug_info = True
 
         # == Initialize computer agent ==
 
         # Find the appropriate agent loop
         if custom_loop:
             self.agent_loop = custom_loop
-            self.
+            self.agent_config_info = None
         else:
-
-            if not
-                raise ValueError(f"No agent
-
-            self.
+            config_info = find_agent_config(model)
+            if not config_info:
+                raise ValueError(f"No agent config found for model: {model}")
+            # Instantiate the agent config class
+            self.agent_loop = config_info.agent_class()
+            self.agent_config_info = config_info
 
         self.tool_schemas = []
         self.computer_handler = None
@@ -389,8 +398,10 @@ class ComputerAgent:
     # AGENT OUTPUT PROCESSING
     # ============================================================================
 
-    async def _handle_item(self, item: Any, computer: Optional[Computer] = None) -> List[Dict[str, Any]]:
+    async def _handle_item(self, item: Any, computer: Optional[Computer] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
         """Handle each item; may cause a computer action + screenshot."""
+        if ignore_call_ids and item.get("call_id") and item.get("call_id") in ignore_call_ids:
+            return []
 
         item_type = item.get("type", None)
 
@@ -411,6 +422,9 @@ class ComputerAgent:
             # Perform computer actions
             action = item.get("action")
             action_type = action.get("type")
+            if action_type is None:
+                print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
+                return []
 
             # Extract action arguments (all fields except 'type')
             action_args = {k: v for k, v in action.items() if k != "type"}
@@ -436,7 +450,7 @@ class ComputerAgent:
                 acknowledged_checks = []
                 for check in pending_checks:
                     check_message = check.get("message", str(check))
-                    if acknowledge_safety_check_callback(check_message):
+                    if acknowledge_safety_check_callback(check_message, allow_always=True): # TODO: implement a callback for safety checks
                         acknowledged_checks.append(check)
                     else:
                         raise ValueError(f"Safety check failed: {check_message}")
@@ -511,6 +525,12 @@ class ComputerAgent:
         Returns:
             AsyncGenerator that yields response chunks
         """
+        if not self.agent_config_info:
+            raise ValueError("Agent configuration not found")
+
+        capabilities = self.get_capabilities()
+        if "step" not in capabilities:
+            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")
 
         await self._initialize_computers()
 
@@ -525,7 +545,7 @@ class ComputerAgent:
             "messages": messages,
             "stream": stream,
             "model": self.model,
-            "agent_loop": self.
+            "agent_loop": self.agent_config_info.agent_class.__name__,
             **merged_kwargs
         }
         await self._on_run_start(run_kwargs, old_items)
@@ -555,7 +575,7 @@ class ComputerAgent:
             }
 
             # Run agent loop iteration
-            result = await self.agent_loop(
+            result = await self.agent_loop.predict_step(
                 **loop_kwargs,
                 _on_api_start=self._on_api_start,
                 _on_api_end=self._on_api_end,
@@ -576,9 +596,12 @@ class ComputerAgent:
             # Add agent response to new_items
             new_items += result.get("output")
 
+            # Get output call ids
+            output_call_ids = get_output_call_ids(result.get("output", []))
+
             # Handle computer actions
             for item in result.get("output"):
-                partial_items = await self._handle_item(item, self.computer_handler)
+                partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
                 new_items += partial_items
 
             # Yield partial response
@@ -591,4 +614,51 @@ class ComputerAgent:
             )
         }
 
-        await self._on_run_end(loop_kwargs, old_items, new_items)
+        await self._on_run_end(loop_kwargs, old_items, new_items)
+
+    async def predict_click(
+        self,
+        instruction: str,
+        image_b64: Optional[str] = None
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates based on image and instruction.
+
+        Args:
+            instruction: Instruction for where to click
+            image_b64: Base64 encoded image (optional, will take screenshot if not provided)
+
+        Returns:
+            None or tuple with (x, y) coordinates
+        """
+        if not self.agent_config_info:
+            raise ValueError("Agent configuration not found")
+
+        capabilities = self.get_capabilities()
+        if "click" not in capabilities:
+            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions")
+        if hasattr(self.agent_loop, 'predict_click'):
+            if not image_b64:
+                if not self.computer_handler:
+                    raise ValueError("Computer tool or image_b64 is required for predict_click")
+                image_b64 = await self.computer_handler.screenshot()
+            return await self.agent_loop.predict_click(
+                model=self.model,
+                image_b64=image_b64,
+                instruction=instruction
+            )
+        return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """
+        Get list of capabilities supported by the current agent config.
+
+        Returns:
+            List of capability strings (e.g., ["step", "click"])
+        """
+        if not self.agent_config_info:
+            raise ValueError("Agent configuration not found")
+
+        if hasattr(self.agent_loop, 'get_capabilities'):
+            return self.agent_loop.get_capabilities()
+        return ["step"] # Default capability
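With these changes, ComputerAgent exposes capability-gated single-shot click grounding next to the existing run loop. A hedged usage sketch (the model id and screenshot path are placeholders; other constructor kwargs such as tools and max_retries are omitted):

```python
# Sketch of the new ComputerAgent surface added in 0.4.8; placeholder values are marked.
import asyncio
import base64
from agent import ComputerAgent

async def main():
    agent = ComputerAgent(model="openai/computer-use-preview")  # placeholder model id

    # New: ask which prediction modes the selected agent config supports.
    print(agent.get_capabilities())  # e.g. ["step"] or ["step", "click"]

    # New: single-shot click grounding. With an explicit screenshot no computer
    # tool is needed; otherwise the agent takes one via its computer handler.
    with open("screenshot.png", "rb") as f:  # placeholder screenshot file
        image_b64 = base64.b64encode(f.read()).decode()
    coords = await agent.predict_click(
        instruction="Click the Submit button",
        image_b64=image_b64,
    )
    print(coords)  # None or an (x, y) tuple

asyncio.run(main())
```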
agent/cli.py
CHANGED
@@ -120,7 +120,7 @@ async def ainput(prompt: str = ""):
 
 async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
     """Main chat loop with the agent."""
-    print_welcome(model, agent.
+    print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
 
     history = []
 
@@ -130,7 +130,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
     total_cost = 0
 
     while True:
-        if history[-1].get("role") != "user":
+        if len(history) == 0 or history[-1].get("role") != "user":
             # Get user input with prompt
             print_colored("> ", end="")
             user_input = await ainput()
@@ -260,7 +260,12 @@ Examples:
         help="Show total cost of the agent runs"
     )
 
-
+    parser.add_argument(
+        "-r", "--max-retries",
+        type=int,
+        default=3,
+        help="Maximum number of retries for the LLM API calls"
+    )
 
     args = parser.parse_args()
 
@@ -327,6 +332,7 @@ Examples:
         "model": args.model,
         "tools": [computer],
         "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
+        "max_retries": args.max_retries
     }
 
     if args.images > 0:
agent/computer_handler.py
CHANGED
@@ -93,8 +93,10 @@ class OpenAIComputerHandler:
         return ""
 
 
-def acknowledge_safety_check_callback(message: str) -> bool:
+def acknowledge_safety_check_callback(message: str, allow_always: bool = False) -> bool:
     """Safety check callback for user acknowledgment."""
+    if allow_always:
+        return True
     response = input(
         f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
     ).lower()
agent/decorators.py
CHANGED
@@ -2,89 +2,51 @@
 Decorators for agent - agent_loop decorator
 """
 
-import
-import
-from typing import Dict, List, Any, Callable, Optional
-from functools import wraps
-
-from .types import AgentLoopInfo
+from typing import List, Optional
+from .types import AgentConfigInfo
 
 # Global registry
-
+_agent_configs: List[AgentConfigInfo] = []
 
-def
+def register_agent(models: str, priority: int = 0):
     """
-    Decorator to register an
+    Decorator to register an AsyncAgentConfig class.
 
     Args:
         models: Regex pattern to match supported models
-        priority: Priority for
+        priority: Priority for agent selection (higher = more priority)
     """
-    def decorator(
-        # Validate
-
-
-
-
-        if not
-
-            raise ValueError(f"Agent loop function must have parameters: {missing}")
+    def decorator(agent_class: type):
+        # Validate that the class implements AsyncAgentConfig protocol
+        if not hasattr(agent_class, 'predict_step'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
+        if not hasattr(agent_class, 'predict_click'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
+        if not hasattr(agent_class, 'get_capabilities'):
+            raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")
 
-        # Register the
-
-
+        # Register the agent config
+        config_info = AgentConfigInfo(
+            agent_class=agent_class,
             models_regex=models,
             priority=priority
        )
-
+        _agent_configs.append(config_info)
 
         # Sort by priority (highest first)
-
-
-        @wraps(func)
-        async def wrapper(*args, **kwargs):
-            # Wrap the function in an asyncio.Queue for cancellation support
-            queue = asyncio.Queue()
-            task = None
-
-            try:
-                # Create a task that can be cancelled
-                async def run_loop():
-                    try:
-                        result = await func(*args, **kwargs)
-                        await queue.put(('result', result))
-                    except Exception as e:
-                        await queue.put(('error', e))
-
-                task = asyncio.create_task(run_loop())
-
-                # Wait for result or cancellation
-                event_type, data = await queue.get()
-
-                if event_type == 'error':
-                    raise data
-                return data
-
-            except asyncio.CancelledError:
-                if task:
-                    task.cancel()
-                    try:
-                        await task
-                    except asyncio.CancelledError:
-                        pass
-                raise
+        _agent_configs.sort(key=lambda x: x.priority, reverse=True)
 
-        return
+        return agent_class
 
     return decorator
 
-def
-    """Get all registered agent
-    return
+def get_agent_configs() -> List[AgentConfigInfo]:
+    """Get all registered agent configs"""
+    return _agent_configs.copy()
 
-def
-    """Find the best matching agent
-    for
-        if
-            return
+def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
+    """Find the best matching agent config for a model"""
+    for config_info in _agent_configs:
+        if config_info.matches_model(model):
+            return config_info
     return None
agent/loops/__init__.py
CHANGED
@@ -7,5 +7,7 @@ from . import anthropic
 from . import openai
 from . import uitars
 from . import omniparser
+from . import gta1
+from . import composed_grounded
 
-__all__ = ["anthropic", "openai", "uitars", "omniparser"]
+__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded"]