cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,370 @@ agent/adapters/mlxvlm_adapter.py (new file)

import asyncio
import base64
import functools
import io
import math
import re
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Tuple, cast

from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from PIL import Image

# Try to import MLX dependencies
try:
    import mlx.core as mx
    from mlx_vlm import generate, load
    from mlx_vlm.prompt_utils import apply_chat_template
    from mlx_vlm.utils import load_config
    from transformers.tokenization_utils import PreTrainedTokenizer

    MLX_AVAILABLE = True
except ImportError:
    MLX_AVAILABLE = False

# Constants for smart_resize
IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200


def round_by_factor(number: float, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    return round(number / factor) * factor


def ceil_by_factor(number: float, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return math.ceil(number / factor) * factor


def floor_by_factor(number: float, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return math.floor(number / factor) * factor


def smart_resize(
    height: int,
    width: int,
    factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
) -> tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.
    """
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar


class MLXVLMAdapter(CustomLLM):
    """MLX VLM Adapter for running vision-language models locally using MLX."""

    def __init__(self, **kwargs):
        """Initialize the adapter.

        Args:
            **kwargs: Additional arguments
        """
        super().__init__()

        self.models = {}  # Cache for loaded models
        self.processors = {}  # Cache for loaded processors
        self.configs = {}  # Cache for loaded configs
        self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool

    def _load_model_and_processor(self, model_name: str):
        """Load model and processor if not already cached.

        Args:
            model_name: Name of the model to load

        Returns:
            Tuple of (model, processor, config)
        """
        if not MLX_AVAILABLE:
            raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")

        if model_name not in self.models:
            # Load model and processor
            model_obj, processor = load(
                model_name, processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
            )
            config = load_config(model_name)

            # Cache them
            self.models[model_name] = model_obj
            self.processors[model_name] = processor
            self.configs[model_name] = config

        return self.models[model_name], self.processors[model_name], self.configs[model_name]

    def _process_coordinates(
        self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]
    ) -> str:
        """Process coordinates in box tokens based on image resizing using smart_resize approach.

        Args:
            text: Text containing box tokens
            original_size: Original image size (width, height)
            model_size: Model processed image size (width, height)

        Returns:
            Text with processed coordinates
        """
        # Find all box tokens
        box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"

        def process_coords(match):
            model_x, model_y = int(match.group(1)), int(match.group(2))
            # Scale coordinates from model space to original image space
            # Both original_size and model_size are in (width, height) format
            new_x = int(model_x * original_size[0] / model_size[0])  # Width
            new_y = int(model_y * original_size[1] / model_size[1])  # Height
            return f"<|box_start|>({new_x},{new_y})<|box_end|>"

        return re.sub(box_pattern, process_coords, text)

    def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[
        List[Dict[str, Any]],
        List[Image.Image],
        Dict[int, Tuple[int, int]],
        Dict[int, Tuple[int, int]],
    ]:
        """Convert OpenAI format messages to MLX VLM format and extract images.

        Args:
            messages: Messages in OpenAI format

        Returns:
            Tuple of (processed_messages, images, original_sizes, model_sizes)
        """
        processed_messages = []
        images = []
        original_sizes = {}  # Track original sizes of images for coordinate mapping
        model_sizes = {}  # Track model processed sizes
        image_index = 0

        for message in messages:
            processed_message = {"role": message["role"], "content": []}

            content = message.get("content", [])
            if isinstance(content, str):
                # Simple text content
                processed_message["content"] = content
            elif isinstance(content, list):
                # Multi-modal content
                processed_content = []
                for item in content:
                    if item.get("type") == "text":
                        processed_content.append({"type": "text", "text": item.get("text", "")})
                    elif item.get("type") == "image_url":
                        image_url = item.get("image_url", {}).get("url", "")
                        pil_image = None

                        if image_url.startswith("data:image/"):
                            # Extract base64 data
                            base64_data = image_url.split(",")[1]
                            # Convert base64 to PIL Image
                            image_data = base64.b64decode(base64_data)
                            pil_image = Image.open(io.BytesIO(image_data))
                        else:
                            # Handle file path or URL
                            pil_image = Image.open(image_url)

                        # Store original image size for coordinate mapping
                        original_size = pil_image.size
                        original_sizes[image_index] = original_size

                        # Use smart_resize to determine model size
                        # Note: smart_resize expects (height, width) but PIL gives (width, height)
                        height, width = original_size[1], original_size[0]
                        new_height, new_width = smart_resize(height, width)
                        # Store model size in (width, height) format for consistent coordinate processing
                        model_sizes[image_index] = (new_width, new_height)

                        # Resize the image using the calculated dimensions from smart_resize
                        resized_image = pil_image.resize((new_width, new_height))
                        images.append(resized_image)

                        # Add image placeholder to content
                        processed_content.append({"type": "image"})

                        image_index += 1

                processed_message["content"] = processed_content

            processed_messages.append(processed_message)

        return processed_messages, images, original_sizes, model_sizes

    def _generate(self, **kwargs) -> str:
        """Generate response using the local MLX VLM model.

        Args:
            **kwargs: Keyword arguments containing messages and model info

        Returns:
            Generated text response
        """
        messages = kwargs.get("messages", [])
        model_name = kwargs.get("model", "mlx-community/UI-TARS-1.5-7B-4bit")
        max_tokens = kwargs.get("max_tokens", 128)

        # Warn about ignored kwargs
        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
        if ignored_kwargs:
            warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")

        # Load model and processor
        model, processor, config = self._load_model_and_processor(model_name)

        # Convert messages and extract images
        processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages)

        # Process user text input with box coordinates after image processing
        # Swap original_size and model_size arguments for inverse transformation
        for msg_idx, msg in enumerate(processed_messages):
            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                content = msg.get("content", "")
                if (
                    "<|box_start|>" in content
                    and original_sizes
                    and model_sizes
                    and 0 in original_sizes
                    and 0 in model_sizes
                ):
                    orig_size = original_sizes[0]
                    model_size = model_sizes[0]
                    # Swap arguments to perform inverse transformation for user input
                    processed_messages[msg_idx]["content"] = self._process_coordinates(
                        content, model_size, orig_size
                    )

        try:
            # Format prompt according to model requirements using the processor directly
            prompt = processor.apply_chat_template(
                processed_messages, tokenize=False, add_generation_prompt=True, return_tensors="pt"
            )
            tokenizer = cast(PreTrainedTokenizer, processor)

            # Generate response
            text_content, usage = generate(
                model,
                tokenizer,
                str(prompt),
                images,  # type: ignore
                verbose=False,
                max_tokens=max_tokens,
            )

        except Exception as e:
            raise RuntimeError(f"Error generating response: {str(e)}") from e

        # Process coordinates in the response back to original image space
        if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
            # Get original image size and model size (using the first image)
            orig_size = original_sizes[0]
            model_size = model_sizes[0]

            # Check if output contains box tokens that need processing
            if "<|box_start|>" in text_content:
                # Process coordinates from model space back to original image space
                text_content = self._process_coordinates(text_content, orig_size, model_size)

        return text_content

    def completion(self, *args, **kwargs) -> ModelResponse:
        """Synchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        generated_text = self._generate(**kwargs)

        result = completion(
            model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
            mock_response=generated_text,
        )
        return cast(ModelResponse, result)

    async def acompletion(self, *args, **kwargs) -> ModelResponse:
        """Asynchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        # Run _generate in thread pool to avoid blocking
        loop = asyncio.get_event_loop()
        generated_text = await loop.run_in_executor(
            self._executor, functools.partial(self._generate, **kwargs)
        )

        result = await acompletion(
            model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
            mock_response=generated_text,
        )
        return cast(ModelResponse, result)

    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        """Synchronous streaming method.

        Returns:
            Iterator of GenericStreamingChunk
        """
        generated_text = self._generate(**kwargs)

        generic_streaming_chunk: GenericStreamingChunk = {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": generated_text,
            "tool_use": None,
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        }

        yield generic_streaming_chunk

    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
        """Asynchronous streaming method.

        Returns:
            AsyncIterator of GenericStreamingChunk
        """
        # Run _generate in thread pool to avoid blocking
        loop = asyncio.get_event_loop()
        generated_text = await loop.run_in_executor(
            self._executor, functools.partial(self._generate, **kwargs)
        )

        generic_streaming_chunk: GenericStreamingChunk = {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": generated_text,
            "tool_use": None,
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        }

        yield generic_streaming_chunk
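The adapter above resizes every screenshot with smart_resize and then maps model-space click coordinates back to the original image. A minimal sketch of that round trip (illustrative only; the 1920x1080 size and the point values are made-up examples, and it assumes the agent package from this release is importable):

from agent.adapters.mlxvlm_adapter import smart_resize

orig_w, orig_h = 1920, 1080                   # example screenshot size (width, height)
new_h, new_w = smart_resize(orig_h, orig_w)   # smart_resize takes (height, width); here (1092, 1932)

# A point predicted in the resized model space is scaled back to the original
# image the same way _process_coordinates rescales <|box_start|> tokens.
model_x, model_y = 966, 546
orig_x = int(model_x * orig_w / new_w)        # 960
orig_y = int(model_y * orig_h / new_h)        # 540
print((new_w, new_h), (orig_x, orig_y))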
@@ -0,0 +1,41 @@ agent/adapters/models/__init__.py (new file)

from typing import Optional

try:
    from transformers import AutoConfig

    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False

from .generic import GenericHFModel
from .internvl import InternVLModel
from .opencua import OpenCUAModel
from .qwen2_5_vl import Qwen2_5_VLModel


def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
    """Factory function to load and return the right model handler instance.

    - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
    - Otherwise, return GenericHFModel
    """
    if not HF_AVAILABLE:
        raise ImportError(
            'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
        )
    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
    cls = cfg.__class__.__name__
    print(f"cls: {cls}")
    if "OpenCUA" in cls:
        return OpenCUAModel(
            model_name=model_name, device=device, trust_remote_code=trust_remote_code
        )
    elif "Qwen2_5_VL" in cls:
        return Qwen2_5_VLModel(
            model_name=model_name, device=device, trust_remote_code=trust_remote_code
        )
    elif "InternVL" in cls:
        return InternVLModel(
            model_name=model_name, device=device, trust_remote_code=trust_remote_code
        )
    return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
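The factory above keys its dispatch off the name of the checkpoint's transformers config class. A hedged usage sketch (the model id is only an example of a checkpoint whose config class name contains "Qwen2_5_VL"; it requires the optional HF extra, e.g. pip install "cua-agent[uitars-hf]"):

from agent.adapters.models import load_model

# Config class name containing "OpenCUA" -> OpenCUAModel, "Qwen2_5_VL" -> Qwen2_5_VLModel,
# "InternVL" -> InternVLModel; anything else falls through to GenericHFModel.
handler = load_model("Qwen/Qwen2.5-VL-3B-Instruct", device="auto")
print(type(handler).__name__)  # expected: Qwen2_5_VLModel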
@@ -0,0 +1,78 @@ agent/adapters/models/generic.py (new file)

from typing import Any, Dict, List, Optional

# Hugging Face imports are local to avoid hard dependency at module import
try:
    import torch  # type: ignore
    from transformers import AutoModel, AutoProcessor  # type: ignore

    HF_AVAILABLE = True
except Exception:
    HF_AVAILABLE = False


class GenericHFModel:
    """Generic Hugging Face vision-language model handler.
    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
    """

    def __init__(
        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
    ) -> None:
        if not HF_AVAILABLE:
            raise ImportError(
                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
            )
        self.model_name = model_name
        self.device = device
        self.model = None
        self.processor = None
        self.trust_remote_code = trust_remote_code
        self._load()

    def _load(self) -> None:
        # Load model
        self.model = AutoModel.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map=self.device,
            attn_implementation="sdpa",
            trust_remote_code=self.trust_remote_code,
        )
        # Load processor
        self.processor = AutoProcessor.from_pretrained(
            self.model_name,
            min_pixels=3136,
            max_pixels=4096 * 2160,
            device_map=self.device,
            trust_remote_code=self.trust_remote_code,
        )

    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
        """Generate text for the given HF-format messages.
        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
        """
        assert self.model is not None and self.processor is not None
        # Apply chat template and tokenize
        inputs = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
        # Move inputs to the same device as model
        inputs = inputs.to(self.model.device)
        # Generate
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Trim prompt tokens from output
        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        # Decode
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return output_text[0] if output_text else ""
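Per the generate docstring above, callers pass HF-style chat messages with inline image parts. A minimal sketch (the checkpoint name and screenshot path are hypothetical placeholders, and it assumes a transformers version whose processor apply_chat_template accepts inline PIL images):

from PIL import Image
from agent.adapters.models.generic import GenericHFModel

model = GenericHFModel("your-org/your-vlm-checkpoint", device="auto")  # hypothetical checkpoint id
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": Image.open("screenshot.png")},  # placeholder path
            {"type": "text", "text": "Describe what is on screen."},
        ],
    }
]
print(model.generate(messages, max_new_tokens=64))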