cua-agent 0.4.30__py3-none-any.whl → 0.4.32__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in that registry.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/adapters/huggingfacelocal_adapter.py +15 -66
- agent/adapters/models/__init__.py +33 -0
- agent/adapters/models/generic.py +75 -0
- agent/adapters/models/internvl.py +254 -0
- agent/adapters/models/opencua.py +100 -0
- agent/adapters/models/qwen2_5_vl.py +75 -0
- agent/agent.py +5 -1
- agent/callbacks/trajectory_saver.py +2 -0
- agent/cli.py +90 -1
- agent/integrations/hud/__init__.py +19 -0
- agent/loops/__init__.py +15 -1
- agent/loops/anthropic.py +2 -3
- agent/loops/composed_grounded.py +1 -1
- agent/loops/glm45v.py +3 -2
- agent/loops/gta1.py +1 -1
- agent/loops/holo.py +216 -0
- agent/loops/internvl.py +185 -0
- agent/loops/opencua.py +142 -0
- agent/loops/uitars.py +1 -1
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/METADATA +20 -4
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/RECORD +23 -15
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/entry_points.txt +0 -0
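Taken together, the new agent/adapters/models package (added below) factors the old inline model loading out of HuggingFaceLocalAdapter into per-model handler classes (GenericHFModel, Qwen2_5_VLModel, InternVLModel, OpenCUAModel) that share one surface. A minimal sketch of that common interface, inferred from the diffs that follow (the Protocol itself is illustrative and not part of the package):

from typing import Any, Dict, List, Protocol

class ModelHandler(Protocol):
    """Illustrative protocol; the real handler classes share this shape by convention."""

    # Constructors look like: Handler(model_name: str, device: str = "auto", trust_remote_code: bool = False)
    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
        """Return generated text for HF-format messages (default max_new_tokens differs per handler)."""
        ...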
agent/adapters/huggingfacelocal_adapter.py
CHANGED
@@ -15,54 +15,31 @@ try:
 except ImportError:
     HF_AVAILABLE = False
 
+from .models import load_model as load_model_handler
 
 class HuggingFaceLocalAdapter(CustomLLM):
     """HuggingFace Local Adapter for running vision-language models locally."""
 
-    def __init__(self, device: str = "auto", **kwargs):
+    def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
         """Initialize the adapter.
 
         Args:
             device: Device to load model on ("auto", "cuda", "cpu", etc.)
+            trust_remote_code: Whether to trust remote code
             **kwargs: Additional arguments
         """
         super().__init__()
         self.device = device
-        self.
-
+        self.trust_remote_code = trust_remote_code
+        # Cache for model handlers keyed by model_name
+        self._handlers: Dict[str, Any] = {}
         self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool
 
-    def
-        """
-
-
-
-
-        Returns:
-            Tuple of (model, processor)
-        """
-        if model_name not in self.models:
-            # Load model
-            model = AutoModelForImageTextToText.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map=self.device,
-                attn_implementation="sdpa"
-            )
-
-            # Load processor
-            processor = AutoProcessor.from_pretrained(
-                model_name,
-                min_pixels=3136,
-                max_pixels=4096 * 2160,
-                device_map=self.device
-            )
-
-            # Cache them
-            self.models[model_name] = model
-            self.processors[model_name] = processor
-
-        return self.models[model_name], self.processors[model_name]
+    def _get_handler(self, model_name: str):
+        """Get or create a model handler for the given model name."""
+        if model_name not in self._handlers:
+            self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code)
+        return self._handlers[model_name]
 
     def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Convert OpenAI format messages to HuggingFace format.
@@ -133,41 +110,13 @@ class HuggingFaceLocalAdapter(CustomLLM):
         if ignored_kwargs:
             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
 
-        # Load model and processor
-        model, processor = self._load_model_and_processor(model_name)
-
         # Convert messages to HuggingFace format
         hf_messages = self._convert_messages(messages)
 
-        #
-
-
-
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        )
-
-        # Move inputs to the same device as model
-        inputs = inputs.to(model.device)
-
-        # Generate response
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-
-        # Trim input tokens from output
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-
-        # Decode output
-        output_text = processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )
-
-        return output_text[0] if output_text else ""
+        # Delegate to model handler
+        handler = self._get_handler(model_name)
+        generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
+        return generated_text
 
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
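With this change, generation in the adapter is a thin delegation: OpenAI-style messages are converted by _convert_messages() into the HF format the handlers document, and the cached handler does the rest. An illustration of that message shape (the values are made up; only the structure matters):

# Illustrative HF-format messages as consumed by handler.generate()
hf_messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "data:image/png;base64,iVBORw0KGgo..."},  # screenshot as a data URL
            {"type": "text", "text": "Click the Submit button."},
        ],
    }
]
# text = handler.generate(hf_messages, max_new_tokens=128)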
agent/adapters/models/__init__.py
ADDED
@@ -0,0 +1,33 @@
+from typing import Optional
+
+try:
+    from transformers import AutoConfig
+    HF_AVAILABLE = True
+except ImportError:
+    HF_AVAILABLE = False
+
+from .generic import GenericHFModel
+from .opencua import OpenCUAModel
+from .qwen2_5_vl import Qwen2_5_VLModel
+from .internvl import InternVLModel
+
+def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
+    """Factory function to load and return the right model handler instance.
+
+    - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
+    - Otherwise, return GenericHFModel
+    """
+    if not HF_AVAILABLE:
+        raise ImportError(
+            "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+        )
+    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+    cls = cfg.__class__.__name__
+    print(f"cls: {cls}")
+    if "OpenCUA" in cls:
+        return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    elif "Qwen2_5_VL" in cls:
+        return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    elif "InternVL" in cls:
+        return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
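A minimal usage sketch of the new factory (the checkpoint name is illustrative): dispatch is on the transformers config class name, so OpenCUA, Qwen2.5-VL, and InternVL checkpoints get their dedicated handlers and anything else falls back to GenericHFModel.

from agent.adapters.models import load_model

# Checkpoint name is illustrative; trust_remote_code is needed for repos that ship custom modeling code.
handler = load_model("OpenGVLab/InternVL3-8B", device="auto", trust_remote_code=True)
text = handler.generate(
    [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
    max_new_tokens=64,
)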
agent/adapters/models/generic.py
ADDED
@@ -0,0 +1,75 @@
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch # type: ignore
+    from transformers import AutoModel, AutoProcessor # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class GenericHFModel:
+    """Generic Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.float16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
agent/adapters/models/internvl.py
ADDED
@@ -0,0 +1,254 @@
+from __future__ import annotations
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch # type: ignore
+    from transformers import AutoModel, AutoTokenizer # type: ignore
+    # Attempt to import InternVL's model dependencies
+    import einops as _ # type: ignore
+    import timm as _ # type: ignore
+    from PIL import Image # type: ignore
+    import torchvision.transforms as T # type: ignore
+    from torchvision.transforms.functional import InterpolationMode # type: ignore
+    import base64 # type: ignore
+    from io import BytesIO # type: ignore
+    import requests # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class InternVLModel:
+    """Generic Hugging Face vision-language model handler.
+    Uses InternVL's native `model.chat()` interface with `AutoTokenizer`.
+    Provides preprocessing to support multi-turn conversations with multiple images.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            use_flash_attn=True,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        ).eval()
+        # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            trust_remote_code=self.trust_remote_code,
+            use_fast=False,
+        )
+
+    # ---- Image preprocessing utilities adapted from InternVL docs ----
+    IMAGENET_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_STD = (0.229, 0.224, 0.225)
+
+    def _build_transform(self, input_size: int) -> T.Compose:
+        MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
+        transform = T.Compose([
+            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD)
+        ])
+        return transform
+
+    def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
+        best_ratio_diff = float('inf')
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+
+    def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+
+        target_ratios = set(
+            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+            i * j <= max_num and i * j >= min_num)
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        target_aspect_ratio = self._find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        resized_img = image.resize((target_width, target_height))
+        processed_images: List[Image.Image] = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size
+            )
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def _load_image_from_source(self, src: str) -> Image.Image:
+        """Load PIL image from various sources: data URL, http(s), or local path."""
+        if src.startswith("data:image/"):
+            # data URL base64
+            header, b64data = src.split(",", 1)
+            img_bytes = base64.b64decode(b64data)
+            return Image.open(BytesIO(img_bytes)).convert('RGB')
+        if src.startswith("http://") or src.startswith("https://"):
+            resp = requests.get(src, timeout=10)
+            resp.raise_for_status()
+            return Image.open(BytesIO(resp.content)).convert('RGB')
+        # Assume local file path
+        return Image.open(src).convert('RGB')
+
+    def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
+        transform = self._build_transform(input_size=input_size)
+        pixel_values_list = []
+        num_patches_list: List[int] = []
+        for img in images:
+            tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+            pv = [transform(tile) for tile in tiles]
+            pv = torch.stack(pv)
+            num_patches_list.append(pv.shape[0])
+            pixel_values_list.append(pv)
+        if not pixel_values_list:
+            return None, []
+        pixel_values = torch.cat(pixel_values_list)
+        return pixel_values, num_patches_list
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+
+        This implementation constructs InternVL-compatible inputs and uses
+        `model.chat(tokenizer, pixel_values, question, history=...)` to avoid
+        relying on AutoProcessor (which fails for some tokenizers).
+        """
+        assert self.model is not None and self.tokenizer is not None
+
+        # Build textual context and collect images and the final question
+        context_lines: List[str] = []
+        all_images: List[Image.Image] = []
+        last_user_text_parts: List[str] = []
+
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", [])
+            if isinstance(content, str):
+                content_items = [{"type": "text", "text": content}]
+            else:
+                content_items = content
+
+            if role == "user":
+                # Collect text and images
+                parts_text: List[str] = []
+                for item in content_items:
+                    if item.get("type") == "text":
+                        t = item.get("text", "")
+                        if t:
+                            parts_text.append(t)
+                    elif item.get("type") == "image":
+                        url = item.get("image", "")
+                        if url:
+                            try:
+                                all_images.append(self._load_image_from_source(url))
+                            except Exception:
+                                # Ignore failed image loads but keep going
+                                pass
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"User: {text}")
+                # Track last user text separately for question
+                last_user_text_parts = parts_text or last_user_text_parts
+            elif role == "assistant":
+                # Only keep text content for history
+                parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"]
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"Assistant: {text}")
+
+        # Prepare pixel values for all collected images (across turns)
+        pixel_values = None
+        num_patches_list: List[int] = []
+        if all_images:
+            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
+            if pixel_values is not None:
+                # Convert dtype/device as in docs
+                pixel_values = pixel_values.to(torch.bfloat16)
+                # Chat API expects tensors on CUDA when model is on CUDA
+                try:
+                    pixel_values = pixel_values.to(self.model.device)
+                except Exception:
+                    pass
+
+        # Build question with any prior context and numbered image placeholders
+        if all_images:
+            # Separate images layout: Image-1: <image> ... then question text
+            prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
+            prefix = "\n".join(prefix_lines) + "\n"
+        else:
+            prefix = ""
+
+        last_user_text = "\n".join(last_user_text_parts).strip()
+        # Combine prior text-only turns as context to emulate multi-turn
+        context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""
+        base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
+        if context_text:
+            question = (context_text + "\n" + prefix + base_question).strip()
+        else:
+            question = (prefix + base_question).strip()
+
+        # Generation config
+        generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
+
+        # Call InternVL chat
+        try:
+            if pixel_values is None:
+                # Pure-text conversation (embed prior turns in question)
+                response = self.model.chat(self.tokenizer, None, question, generation_config)
+            else:
+                # Multi-image: pass num_patches_list if >1 image
+                if len(num_patches_list) > 1:
+                    response = self.model.chat(
+                        self.tokenizer,
+                        pixel_values,
+                        question,
+                        generation_config,
+                        num_patches_list=num_patches_list,
+                    )
+                else:
+                    response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
+        except Exception as e:
+            # Fallback: return empty string to avoid crashing the adapter
+            return ""
+
+        return response or ""
agent/adapters/models/opencua.py
ADDED
@@ -0,0 +1,100 @@
+from typing import List, Dict, Any
+import re
+import base64
+from io import BytesIO
+
+try:
+    import torch # type: ignore
+    from transformers import AutoTokenizer, AutoModel, AutoImageProcessor # type: ignore
+    from PIL import Image # type: ignore
+    import blobfile as _ # assert blobfile is installed
+    OPENCUA_AVAILABLE = True
+except Exception:
+    OPENCUA_AVAILABLE = False
+
+
+class OpenCUAModel:
+    """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not OPENCUA_AVAILABLE:
+            raise ImportError(
+                "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.image_processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype="auto",
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+            attn_implementation="sdpa",
+        )
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+
+    @staticmethod
+    def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
+        # Expect HF-format messages with content items type: "image" with data URL
+        for msg in reversed(messages):
+            for item in reversed(msg.get("content", [])):
+                if isinstance(item, dict) and item.get("type") == "image":
+                    url = item.get("image", "")
+                    if isinstance(url, str) and url.startswith("data:image/"):
+                        return url.split(",", 1)[1]
+        return ""
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
+        assert self.model is not None and self.tokenizer is not None and self.image_processor is not None
+
+        # Tokenize text side using chat template
+        input_ids = self.tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True
+        )
+        input_ids = torch.tensor([input_ids]).to(self.model.device)
+
+        # Prepare image inputs from last data URL image
+        image_b64 = self._extract_last_image_b64(messages)
+        pixel_values = None
+        grid_thws = None
+        if image_b64:
+            image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB")
+            image_info = self.image_processor.preprocess(images=[image])
+            pixel_values = torch.tensor(image_info["pixel_values"]).to(
+                dtype=torch.bfloat16, device=self.model.device
+            )
+            grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None
+
+        gen_kwargs: Dict[str, Any] = {
+            "max_new_tokens": max_new_tokens,
+            "temperature": 0,
+        }
+        if pixel_values is not None:
+            gen_kwargs["pixel_values"] = pixel_values
+        if grid_thws is not None:
+            gen_kwargs["grid_thws"] = grid_thws
+
+        with torch.no_grad():
+            generated_ids = self.model.generate(
+                input_ids,
+                **gen_kwargs,
+            )
+
+        # Remove prompt tokens
+        prompt_len = input_ids.shape[1]
+        generated_ids = generated_ids[:, prompt_len:]
+        output_text = self.tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return output_text
agent/adapters/models/qwen2_5_vl.py
ADDED
@@ -0,0 +1,75 @@
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch # type: ignore
+    from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class Qwen2_5_VLModel:
+    """Qwen2.5-VL Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
agent/agent.py
CHANGED
@@ -171,6 +171,7 @@ class ComputerAgent:
         use_prompt_caching: Optional[bool] = False,
         max_trajectory_budget: Optional[float | dict] = None,
         telemetry_enabled: Optional[bool] = True,
+        trust_remote_code: Optional[bool] = False,
         **kwargs
     ):
         """
@@ -190,6 +191,7 @@ class ComputerAgent:
             use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers.
             max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
             telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
+            trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
             **kwargs: Additional arguments passed to the agent loop
         """
         # If the loop is "human/human", we need to prefix a grounding model fallback
@@ -209,6 +211,7 @@ class ComputerAgent:
         self.use_prompt_caching = use_prompt_caching
         self.telemetry_enabled = telemetry_enabled
         self.kwargs = kwargs
+        self.trust_remote_code = trust_remote_code
 
         # == Add built-in callbacks ==
 
@@ -252,7 +255,8 @@ class ComputerAgent:
 
         # Register local model providers
         hf_adapter = HuggingFaceLocalAdapter(
-            device="auto"
+            device="auto",
+            trust_remote_code=self.trust_remote_code or False
         )
         human_adapter = HumanAdapter()
         mlx_adapter = MLXVLMAdapter()
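From the caller's side, the flag is surfaced on ComputerAgent and forwarded to HuggingFaceLocalAdapter when local model providers are registered. A hedged sketch (import path and model string are assumptions; other constructor arguments are omitted):

from agent import ComputerAgent  # import path assumed

agent = ComputerAgent(
    model="huggingface-local/OpenGVLab/InternVL3-8B",  # model string illustrative
    trust_remote_code=True,  # forwarded to HuggingFaceLocalAdapter for repos with custom modeling code
)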