cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff compares the contents of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Potentially problematic release: this version of cua-agent has been flagged as possibly problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
--- agent/adapters/huggingfacelocal_adapter.py (cua-agent 0.4.22)
+++ agent/adapters/huggingfacelocal_adapter.py (cua-agent 0.7.16)
@@ -2,212 +2,155 @@ import asyncio
 import functools
 import warnings
 from concurrent.futures import ThreadPoolExecutor
-from typing import
-
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional
+
+from litellm import acompletion, completion
 from litellm.llms.custom_llm import CustomLLM
-from litellm import
+from litellm.types.utils import GenericStreamingChunk, ModelResponse
 
 # Try to import HuggingFace dependencies
 try:
     import torch
     from transformers import AutoModelForImageTextToText, AutoProcessor
+
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
 
+from .models import load_model as load_model_handler
+
 
 class HuggingFaceLocalAdapter(CustomLLM):
     """HuggingFace Local Adapter for running vision-language models locally."""
-
-    def __init__(self, device: str = "auto", **kwargs):
+
+    def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
         """Initialize the adapter.
-
+
         Args:
             device: Device to load model on ("auto", "cuda", "cpu", etc.)
+            trust_remote_code: Whether to trust remote code
             **kwargs: Additional arguments
         """
         super().__init__()
         self.device = device
-        self.models = {}
-        self.processors = {}
+        self.trust_remote_code = trust_remote_code
+        # Cache for model handlers keyed by model_name
+        self._handlers: Dict[str, Any] = {}
         self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool
-
-    def _load_model_and_processor(self, model_name: str):
-        """Load the model and processor for the given model name.
-
-        Args:
-            model_name: Name of the model to load
-
-        Returns:
-            Tuple of (model, processor)
-        """
-        if model_name not in self.models:
-            # Load model
-            model = AutoModelForImageTextToText.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map=self.device,
-                attn_implementation="sdpa"
-            )
-
-            # Load processor
-            processor = AutoProcessor.from_pretrained(
-                model_name,
-                min_pixels=3136,
-                max_pixels=4096 * 2160,
-                device_map=self.device
+
+    def _get_handler(self, model_name: str):
+        """Get or create a model handler for the given model name."""
+        if model_name not in self._handlers:
+            self._handlers[model_name] = load_model_handler(
+                model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code
             )
-
-
-            self.models[model_name] = model
-            self.processors[model_name] = processor
-
-        return self.models[model_name], self.processors[model_name]
-
+        return self._handlers[model_name]
+
     def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Convert OpenAI format messages to HuggingFace format.
-
+
         Args:
             messages: Messages in OpenAI format
-
+
         Returns:
             Messages in HuggingFace format
         """
         converted_messages = []
-
+
         for message in messages:
-            converted_message = {
-                "role": message["role"],
-                "content": []
-            }
-
+            converted_message = {"role": message["role"], "content": []}
+
             content = message.get("content", [])
             if isinstance(content, str):
                 # Simple text content
-                converted_message["content"].append({
-                    "type": "text",
-                    "text": content
-                })
+                converted_message["content"].append({"type": "text", "text": content})
             elif isinstance(content, list):
                 # Multi-modal content
                 for item in content:
                     if item.get("type") == "text":
-                        converted_message["content"].append({
-                            "type": "text",
-                            "text": item.get("text", "")
-                        })
+                        converted_message["content"].append(
+                            {"type": "text", "text": item.get("text", "")}
+                        )
                     elif item.get("type") == "image_url":
                         # Convert image_url format to image format
                         image_url = item.get("image_url", {}).get("url", "")
-                        converted_message["content"].append({
-                            "type": "image",
-                            "image": image_url
-                        })
-
+                        converted_message["content"].append({"type": "image", "image": image_url})
+
             converted_messages.append(converted_message)
-
+
         return converted_messages
-
+
     def _generate(self, **kwargs) -> str:
         """Generate response using the local HuggingFace model.
-
+
         Args:
             **kwargs: Keyword arguments containing messages and model info
-
+
         Returns:
             Generated text response
         """
         if not HF_AVAILABLE:
             raise ImportError(
                 "HuggingFace transformers dependencies not found. "
-                "Please install with: pip install \"cua-agent[uitars-hf]\""
+                'Please install with: pip install "cua-agent[uitars-hf]"'
             )
-
+
         # Extract messages and model from kwargs
-        messages = kwargs.get(
-        model_name = kwargs.get(
-        max_new_tokens = kwargs.get(
-
+        messages = kwargs.get("messages", [])
+        model_name = kwargs.get("model", "ByteDance-Seed/UI-TARS-1.5-7B")
+        max_new_tokens = kwargs.get("max_tokens", 128)
+
         # Warn about ignored kwargs
-        ignored_kwargs = set(kwargs.keys()) - {
+        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
         if ignored_kwargs:
             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
-
-        # Load model and processor
-        model, processor = self._load_model_and_processor(model_name)
-
+
         # Convert messages to HuggingFace format
        hf_messages = self._convert_messages(messages)
-
-        # Apply chat template
-        inputs = processor.apply_chat_template(
-            hf_messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        )
-
-        # Move inputs to the same device as model
-        inputs = inputs.to(model.device)
-
-        # Generate response
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-
-        # Trim input tokens from output
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-
-        # Decode output
-        output_text = processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )
-
-        return output_text[0] if output_text else ""
-
+
+        # Delegate to model handler
+        handler = self._get_handler(model_name)
+        generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
+        return generated_text
+
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
-
+
         Returns:
             ModelResponse with generated text
         """
         generated_text = self._generate(**kwargs)
-
+
         return completion(
             model=f"huggingface-local/{kwargs['model']}",
             mock_response=generated_text,
         )
-
+
     async def acompletion(self, *args, **kwargs) -> ModelResponse:
         """Asynchronous completion method.
-
+
         Returns:
             ModelResponse with generated text
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
         )
-
+
         return await acompletion(
             model=f"huggingface-local/{kwargs['model']}",
             mock_response=generated_text,
         )
-
+
     def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
         """Synchronous streaming method.
-
+
         Returns:
             Iterator of GenericStreamingChunk
         """
         generated_text = self._generate(**kwargs)
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -216,22 +159,21 @@ class HuggingFaceLocalAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
+
         yield generic_streaming_chunk
-
+
     async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
         """Asynchronous streaming method.
-
+
         Returns:
             AsyncIterator of GenericStreamingChunk
         """
         # Run _generate in thread pool to avoid blocking
         loop = asyncio.get_event_loop()
         generated_text = await loop.run_in_executor(
-            self._executor,
-            functools.partial(self._generate, **kwargs)
+            self._executor, functools.partial(self._generate, **kwargs)
         )
-
+
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
             "index": 0,
@@ -240,5 +182,5 @@ class HuggingFaceLocalAdapter(CustomLLM):
             "tool_use": None,
             "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
         }
-
-        yield generic_streaming_chunk
+
+        yield generic_streaming_chunk