cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/adapters/models/internvl.py (new file)
@@ -0,0 +1,290 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import base64  # type: ignore
+    from io import BytesIO  # type: ignore
+
+    # Attempt to import InternVL's model dependencies
+    import einops as _  # type: ignore
+    import requests  # type: ignore
+    import timm as _  # type: ignore
+    import torch  # type: ignore
+    import torchvision.transforms as T  # type: ignore
+    from PIL import Image  # type: ignore
+    from torchvision.transforms.functional import InterpolationMode  # type: ignore
+    from transformers import AutoModel, AutoTokenizer  # type: ignore
+
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class InternVLModel:
+    """Generic Hugging Face vision-language model handler.
+    Uses InternVL's native `model.chat()` interface with `AutoTokenizer`.
+    Provides preprocessing to support multi-turn conversations with multiple images.
+    """
+
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                'InternVL dependencies not found. Install with: pip install "cua-agent[internvl-hf]"'
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            use_flash_attn=True,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        ).eval()
+        # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            trust_remote_code=self.trust_remote_code,
+            use_fast=False,
+        )
+
+    # ---- Image preprocessing utilities adapted from InternVL docs ----
+    IMAGENET_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_STD = (0.229, 0.224, 0.225)
+
+    def _build_transform(self, input_size: int) -> T.Compose:
+        MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
+        transform = T.Compose(
+            [
+                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                T.ToTensor(),
+                T.Normalize(mean=MEAN, std=STD),
+            ]
+        )
+        return transform
+
+    def _find_closest_aspect_ratio(
+        self,
+        aspect_ratio: float,
+        target_ratios: List[tuple],
+        width: int,
+        height: int,
+        image_size: int,
+    ):
+        best_ratio_diff = float("inf")
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+
+    def _dynamic_preprocess(
+        self,
+        image: Image.Image,
+        min_num: int = 1,
+        max_num: int = 12,
+        image_size: int = 448,
+        use_thumbnail: bool = True,
+    ) -> List[Image.Image]:
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+
+        target_ratios = set(
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        target_aspect_ratio = self._find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
+
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        resized_img = image.resize((target_width, target_height))
+        processed_images: List[Image.Image] = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size,
+            )
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def _load_image_from_source(self, src: str) -> Image.Image:
+        """Load PIL image from various sources: data URL, http(s), or local path."""
+        if src.startswith("data:image/"):
+            # data URL base64
+            header, b64data = src.split(",", 1)
+            img_bytes = base64.b64decode(b64data)
+            return Image.open(BytesIO(img_bytes)).convert("RGB")
+        if src.startswith("http://") or src.startswith("https://"):
+            resp = requests.get(src, timeout=10)
+            resp.raise_for_status()
+            return Image.open(BytesIO(resp.content)).convert("RGB")
+        # Assume local file path
+        return Image.open(src).convert("RGB")
+
+    def _images_to_pixel_values(
+        self, images: List[Image.Image], input_size: int = 448, max_num: int = 12
+    ):
+        transform = self._build_transform(input_size=input_size)
+        pixel_values_list = []
+        num_patches_list: List[int] = []
+        for img in images:
+            tiles = self._dynamic_preprocess(
+                img, image_size=input_size, use_thumbnail=True, max_num=max_num
+            )
+            pv = [transform(tile) for tile in tiles]
+            pv = torch.stack(pv)
+            num_patches_list.append(pv.shape[0])
+            pixel_values_list.append(pv)
+        if not pixel_values_list:
+            return None, []
+        pixel_values = torch.cat(pixel_values_list)
+        return pixel_values, num_patches_list
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+
+        This implementation constructs InternVL-compatible inputs and uses
+        `model.chat(tokenizer, pixel_values, question, history=...)` to avoid
+        relying on AutoProcessor (which fails for some tokenizers).
+        """
+        assert self.model is not None and self.tokenizer is not None
+
+        # Build textual context and collect images and the final question
+        context_lines: List[str] = []
+        all_images: List[Image.Image] = []
+        last_user_text_parts: List[str] = []
+
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", [])
+            if isinstance(content, str):
+                content_items = [{"type": "text", "text": content}]
+            else:
+                content_items = content
+
+            if role == "user":
+                # Collect text and images
+                parts_text: List[str] = []
+                for item in content_items:
+                    if item.get("type") == "text":
+                        t = item.get("text", "")
+                        if t:
+                            parts_text.append(t)
+                    elif item.get("type") == "image":
+                        url = item.get("image", "")
+                        if url:
+                            try:
+                                all_images.append(self._load_image_from_source(url))
+                            except Exception:
+                                # Ignore failed image loads but keep going
+                                pass
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"User: {text}")
+                # Track last user text separately for question
+                last_user_text_parts = parts_text or last_user_text_parts
+            elif role == "assistant":
+                # Only keep text content for history
+                parts_text = [
+                    item.get("text", "") for item in content_items if item.get("type") == "text"
+                ]
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"Assistant: {text}")
+
+        # Prepare pixel values for all collected images (across turns)
+        pixel_values = None
+        num_patches_list: List[int] = []
+        if all_images:
+            pixel_values, num_patches_list = self._images_to_pixel_values(
+                all_images, input_size=448, max_num=12
+            )
+            if pixel_values is not None:
+                # Convert dtype/device as in docs
+                pixel_values = pixel_values.to(torch.bfloat16)
+                # Chat API expects tensors on CUDA when model is on CUDA
+                try:
+                    pixel_values = pixel_values.to(self.model.device)
+                except Exception:
+                    pass
+
+        # Build question with any prior context and numbered image placeholders
+        if all_images:
+            # Separate images layout: Image-1: <image> ... then question text
+            prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
+            prefix = "\n".join(prefix_lines) + "\n"
+        else:
+            prefix = ""
+
+        last_user_text = "\n".join(last_user_text_parts).strip()
+        # Combine prior text-only turns as context to emulate multi-turn
+        context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""
+        base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
+        if context_text:
+            question = (context_text + "\n" + prefix + base_question).strip()
+        else:
+            question = (prefix + base_question).strip()
+
+        # Generation config
+        generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
+
+        # Call InternVL chat
+        try:
+            if pixel_values is None:
+                # Pure-text conversation (embed prior turns in question)
+                response = self.model.chat(self.tokenizer, None, question, generation_config)
+            else:
+                # Multi-image: pass num_patches_list if >1 image
+                if len(num_patches_list) > 1:
+                    response = self.model.chat(
+                        self.tokenizer,
+                        pixel_values,
+                        question,
+                        generation_config,
+                        num_patches_list=num_patches_list,
+                    )
+                else:
+                    response = self.model.chat(
+                        self.tokenizer, pixel_values, question, generation_config
+                    )
+        except Exception as e:
+            # Fallback: return empty string to avoid crashing the adapter
+            return ""

+        return response or ""
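A minimal usage sketch for the new InternVLModel handler (illustrative only, not part of the package diff). The checkpoint id and screenshot path are placeholders, the import path is inferred from the file list above, and the message format follows the docstring in generate():

# Hypothetical example: send one screenshot plus an instruction to InternVLModel.
import base64

from agent.adapters.models.internvl import InternVLModel

with open("screenshot.png", "rb") as f:  # placeholder path
    data_url = "data:image/png;base64," + base64.b64encode(f.read()).decode()

# Example checkpoint id; InternVL checkpoints need trust_remote_code=True (see _load above).
model = InternVLModel("OpenGVLab/InternVL2-8B", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": data_url},
            {"type": "text", "text": "Describe the visible UI elements."},
        ],
    }
]
print(model.generate(messages, max_new_tokens=256))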
agent/adapters/models/opencua.py (new file)
@@ -0,0 +1,115 @@
+import base64
+import re
+from io import BytesIO
+from typing import Any, Dict, List
+
+try:
+    import blobfile as _  # assert blobfile is installed
+    import torch  # type: ignore
+    from PIL import Image  # type: ignore
+    from transformers import (  # type: ignore
+        AutoImageProcessor,
+        AutoModel,
+        AutoTokenizer,
+    )
+
+    OPENCUA_AVAILABLE = True
+except Exception:
+    OPENCUA_AVAILABLE = False
+
+
+class OpenCUAModel:
+    """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
+
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
+        if not OPENCUA_AVAILABLE:
+            raise ImportError(
+                'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.image_processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype="auto",
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+            attn_implementation="sdpa",
+        )
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+
+    @staticmethod
+    def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
+        # Expect HF-format messages with content items type: "image" with data URL
+        for msg in reversed(messages):
+            for item in reversed(msg.get("content", [])):
+                if isinstance(item, dict) and item.get("type") == "image":
+                    url = item.get("image", "")
+                    if isinstance(url, str) and url.startswith("data:image/"):
+                        return url.split(",", 1)[1]
+        return ""
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
+        assert (
+            self.model is not None
+            and self.tokenizer is not None
+            and self.image_processor is not None
+        )
+
+        # Tokenize text side using chat template
+        input_ids = self.tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True
+        )
+        input_ids = torch.tensor([input_ids]).to(self.model.device)
+
+        # Prepare image inputs from last data URL image
+        image_b64 = self._extract_last_image_b64(messages)
+        pixel_values = None
+        grid_thws = None
+        if image_b64:
+            image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB")
+            image_info = self.image_processor.preprocess(images=[image])
+            pixel_values = torch.tensor(image_info["pixel_values"]).to(
+                dtype=torch.bfloat16, device=self.model.device
+            )
+            grid_thws = (
+                torch.tensor(image_info["image_grid_thw"])
+                if "image_grid_thw" in image_info
+                else None
+            )
+
+        gen_kwargs: Dict[str, Any] = {
+            "max_new_tokens": max_new_tokens,
+            "temperature": 0,
+        }
+        if pixel_values is not None:
+            gen_kwargs["pixel_values"] = pixel_values
+        if grid_thws is not None:
+            gen_kwargs["grid_thws"] = grid_thws
+
+        with torch.no_grad():
+            generated_ids = self.model.generate(
+                input_ids,
+                **gen_kwargs,
+            )
+
+        # Remove prompt tokens
+        prompt_len = input_ids.shape[1]
+        generated_ids = generated_ids[:, prompt_len:]
+        output_text = self.tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return output_text
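The OpenCUA handler only consumes the most recent data-URL image in the conversation (via _extract_last_image_b64), so screenshots must be inlined as base64 data URLs rather than passed as paths or URLs. A minimal sketch under that assumption (illustrative, not part of the diff), with a placeholder checkpoint id and screenshot path:

# Hypothetical example: OpenCUAModel with the screenshot inlined as a data URL.
import base64

from agent.adapters.models.opencua import OpenCUAModel

b64 = base64.b64encode(open("screenshot.png", "rb").read()).decode()  # placeholder path
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": f"data:image/png;base64,{b64}"},
            {"type": "text", "text": "Click the Submit button."},
        ],
    }
]
model = OpenCUAModel("xlangai/OpenCUA-7B", trust_remote_code=True)  # example checkpoint id
print(model.generate(messages, max_new_tokens=512))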
agent/adapters/models/qwen2_5_vl.py (new file)
@@ -0,0 +1,78 @@
+from typing import Any, Dict, List, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch  # type: ignore
+    from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore
+
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class Qwen2_5_VLModel:
+    """Qwen2.5-VL Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(
+        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
+    ) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
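Unlike the InternVL handler, Qwen2_5_VLModel hands the HF-format messages straight to AutoProcessor.apply_chat_template, so image entries can be anything the Qwen2.5-VL processor accepts, such as an http(s) URL or local path. A minimal sketch (illustrative, not part of the diff), with the checkpoint id and image URL as assumed examples:

# Hypothetical example: one grounding-style prompt through Qwen2_5_VLModel.
from agent.adapters.models.qwen2_5_vl import Qwen2_5_VLModel

model = Qwen2_5_VLModel("Qwen/Qwen2.5-VL-7B-Instruct")  # example checkpoint id
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/screenshot.png"},  # placeholder URL
            {"type": "text", "text": "Return the coordinates of the search box."},
        ],
    }
]
print(model.generate(messages, max_new_tokens=128))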