cua-agent 0.4.30__py3-none-any.whl → 0.4.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

@@ -15,54 +15,31 @@ try:
 except ImportError:
     HF_AVAILABLE = False
 
+from .models import load_model as load_model_handler
 
 class HuggingFaceLocalAdapter(CustomLLM):
     """HuggingFace Local Adapter for running vision-language models locally."""
 
-    def __init__(self, device: str = "auto", **kwargs):
+    def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
         """Initialize the adapter.
 
         Args:
             device: Device to load model on ("auto", "cuda", "cpu", etc.)
+            trust_remote_code: Whether to trust remote code
             **kwargs: Additional arguments
         """
         super().__init__()
         self.device = device
-        self.models = {}  # Cache for loaded models
-        self.processors = {}  # Cache for loaded processors
+        self.trust_remote_code = trust_remote_code
+        # Cache for model handlers keyed by model_name
+        self._handlers: Dict[str, Any] = {}
         self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool
 
-    def _load_model_and_processor(self, model_name: str):
-        """Load model and processor if not already cached.
-
-        Args:
-            model_name: Name of the model to load
-
-        Returns:
-            Tuple of (model, processor)
-        """
-        if model_name not in self.models:
-            # Load model
-            model = AutoModelForImageTextToText.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map=self.device,
-                attn_implementation="sdpa"
-            )
-
-            # Load processor
-            processor = AutoProcessor.from_pretrained(
-                model_name,
-                min_pixels=3136,
-                max_pixels=4096 * 2160,
-                device_map=self.device
-            )
-
-            # Cache them
-            self.models[model_name] = model
-            self.processors[model_name] = processor
-
-        return self.models[model_name], self.processors[model_name]
+    def _get_handler(self, model_name: str):
+        """Get or create a model handler for the given model name."""
+        if model_name not in self._handlers:
+            self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code)
+        return self._handlers[model_name]
 
     def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Convert OpenAI format messages to HuggingFace format.
@@ -133,41 +110,13 @@ class HuggingFaceLocalAdapter(CustomLLM):
         if ignored_kwargs:
             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
 
-        # Load model and processor
-        model, processor = self._load_model_and_processor(model_name)
-
         # Convert messages to HuggingFace format
         hf_messages = self._convert_messages(messages)
 
-        # Apply chat template and tokenize
-        inputs = processor.apply_chat_template(
-            hf_messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        )
-
-        # Move inputs to the same device as model
-        inputs = inputs.to(model.device)
-
-        # Generate response
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-
-        # Trim input tokens from output
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-
-        # Decode output
-        output_text = processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )
-
-        return output_text[0] if output_text else ""
+        # Delegate to model handler
+        handler = self._get_handler(model_name)
+        generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
+        return generated_text
 
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
@@ -0,0 +1,33 @@
+from typing import Optional
+
+try:
+    from transformers import AutoConfig
+    HF_AVAILABLE = True
+except ImportError:
+    HF_AVAILABLE = False
+
+from .generic import GenericHFModel
+from .opencua import OpenCUAModel
+from .qwen2_5_vl import Qwen2_5_VLModel
+from .internvl import InternVLModel
+
+def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
+    """Factory function to load and return the right model handler instance.
+
+    - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
+    - Otherwise, return GenericHFModel
+    """
+    if not HF_AVAILABLE:
+        raise ImportError(
+            "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+        )
+    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+    cls = cfg.__class__.__name__
+    print(f"cls: {cls}")
+    if "OpenCUA" in cls:
+        return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    elif "Qwen2_5_VL" in cls:
+        return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    elif "InternVL" in cls:
+        return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
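The factory routes on the name of the checkpoint's transformers config class. An illustrative sketch of routing plus generation; the checkpoint name and image payload are assumptions, not part of this diff:

handler = load_model("Qwen/Qwen2.5-VL-7B-Instruct", device="auto")  # config class Qwen2_5_VLConfig -> Qwen2_5_VLModel
text = handler.generate(
    [{"role": "user", "content": [
        {"type": "image", "image": "data:image/png;base64,..."},
        {"type": "text", "text": "Describe this screenshot."},
    ]}],
    max_new_tokens=128,
)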
@@ -0,0 +1,75 @@
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch  # type: ignore
+    from transformers import AutoModel, AutoProcessor  # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class GenericHFModel:
+    """Generic Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.float16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
@@ -0,0 +1,254 @@
+from __future__ import annotations
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch  # type: ignore
+    from transformers import AutoModel, AutoTokenizer  # type: ignore
+    # Attempt to import InternVL's model dependencies
+    import einops as _  # type: ignore
+    import timm as _  # type: ignore
+    from PIL import Image  # type: ignore
+    import torchvision.transforms as T  # type: ignore
+    from torchvision.transforms.functional import InterpolationMode  # type: ignore
+    import base64  # type: ignore
+    from io import BytesIO  # type: ignore
+    import requests  # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class InternVLModel:
+    """Generic Hugging Face vision-language model handler.
+    Uses InternVL's native `model.chat()` interface with `AutoTokenizer`.
+    Provides preprocessing to support multi-turn conversations with multiple images.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            use_flash_attn=True,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        ).eval()
+        # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            trust_remote_code=self.trust_remote_code,
+            use_fast=False,
+        )
+
+    # ---- Image preprocessing utilities adapted from InternVL docs ----
+    IMAGENET_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_STD = (0.229, 0.224, 0.225)
+
+    def _build_transform(self, input_size: int) -> T.Compose:
+        MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
+        transform = T.Compose([
+            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD)
+        ])
+        return transform
+
+    def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
+        best_ratio_diff = float('inf')
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+
+    def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+
+        target_ratios = set(
+            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+            i * j <= max_num and i * j >= min_num)
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        target_aspect_ratio = self._find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        resized_img = image.resize((target_width, target_height))
+        processed_images: List[Image.Image] = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size
+            )
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def _load_image_from_source(self, src: str) -> Image.Image:
+        """Load PIL image from various sources: data URL, http(s), or local path."""
+        if src.startswith("data:image/"):
+            # data URL base64
+            header, b64data = src.split(",", 1)
+            img_bytes = base64.b64decode(b64data)
+            return Image.open(BytesIO(img_bytes)).convert('RGB')
+        if src.startswith("http://") or src.startswith("https://"):
+            resp = requests.get(src, timeout=10)
+            resp.raise_for_status()
+            return Image.open(BytesIO(resp.content)).convert('RGB')
+        # Assume local file path
+        return Image.open(src).convert('RGB')
+
+    def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
+        transform = self._build_transform(input_size=input_size)
+        pixel_values_list = []
+        num_patches_list: List[int] = []
+        for img in images:
+            tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+            pv = [transform(tile) for tile in tiles]
+            pv = torch.stack(pv)
+            num_patches_list.append(pv.shape[0])
+            pixel_values_list.append(pv)
+        if not pixel_values_list:
+            return None, []
+        pixel_values = torch.cat(pixel_values_list)
+        return pixel_values, num_patches_list
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+
+        This implementation constructs InternVL-compatible inputs and uses
+        `model.chat(tokenizer, pixel_values, question, history=...)` to avoid
+        relying on AutoProcessor (which fails for some tokenizers).
+        """
+        assert self.model is not None and self.tokenizer is not None
+
+        # Build textual context and collect images and the final question
+        context_lines: List[str] = []
+        all_images: List[Image.Image] = []
+        last_user_text_parts: List[str] = []
+
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", [])
+            if isinstance(content, str):
+                content_items = [{"type": "text", "text": content}]
+            else:
+                content_items = content
+
+            if role == "user":
+                # Collect text and images
+                parts_text: List[str] = []
+                for item in content_items:
+                    if item.get("type") == "text":
+                        t = item.get("text", "")
+                        if t:
+                            parts_text.append(t)
+                    elif item.get("type") == "image":
+                        url = item.get("image", "")
+                        if url:
+                            try:
+                                all_images.append(self._load_image_from_source(url))
+                            except Exception:
+                                # Ignore failed image loads but keep going
+                                pass
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"User: {text}")
+                # Track last user text separately for question
+                last_user_text_parts = parts_text or last_user_text_parts
+            elif role == "assistant":
+                # Only keep text content for history
+                parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"]
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"Assistant: {text}")
+
+        # Prepare pixel values for all collected images (across turns)
+        pixel_values = None
+        num_patches_list: List[int] = []
+        if all_images:
+            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
+            if pixel_values is not None:
+                # Convert dtype/device as in docs
+                pixel_values = pixel_values.to(torch.bfloat16)
+                # Chat API expects tensors on CUDA when model is on CUDA
+                try:
+                    pixel_values = pixel_values.to(self.model.device)
+                except Exception:
+                    pass
+
+        # Build question with any prior context and numbered image placeholders
+        if all_images:
+            # Separate images layout: Image-1: <image> ... then question text
+            prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
+            prefix = "\n".join(prefix_lines) + "\n"
+        else:
+            prefix = ""
+
+        last_user_text = "\n".join(last_user_text_parts).strip()
+        # Combine prior text-only turns as context to emulate multi-turn
+        context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""
+        base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
+        if context_text:
+            question = (context_text + "\n" + prefix + base_question).strip()
+        else:
+            question = (prefix + base_question).strip()
+
+        # Generation config
+        generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
+
+        # Call InternVL chat
+        try:
+            if pixel_values is None:
+                # Pure-text conversation (embed prior turns in question)
+                response = self.model.chat(self.tokenizer, None, question, generation_config)
+            else:
+                # Multi-image: pass num_patches_list if >1 image
+                if len(num_patches_list) > 1:
+                    response = self.model.chat(
+                        self.tokenizer,
+                        pixel_values,
+                        question,
+                        generation_config,
+                        num_patches_list=num_patches_list,
+                    )
+                else:
+                    response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
+        except Exception as e:
+            # Fallback: return empty string to avoid crashing the adapter
+            return ""
+
+        return response or ""
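The dynamic tiling above follows the InternVL preprocessing recipe. A hand-traced sketch of the grid it selects for a 1920x1080 screenshot (values computed here for illustration, not taken from the diff):

aspect = 1920 / 1080                                   # ~1.778
candidates = sorted(
    {(i, j) for n in range(1, 13) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= 12},
    key=lambda g: g[0] * g[1],
)
best, best_diff = (1, 1), float("inf")
for i, j in candidates:
    diff = abs(aspect - i / j)
    if diff < best_diff:
        best, best_diff = (i, j), diff
    elif diff == best_diff and 1920 * 1080 > 0.5 * 448 * 448 * i * j:
        best = (i, j)                                   # area tie-break prefers the larger grid
print(best)  # (4, 2): the image is resized to 1792x896, cut into 8 tiles of 448x448, plus a thumbnail -> 9 patches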
@@ -0,0 +1,100 @@
+from typing import List, Dict, Any
+import re
+import base64
+from io import BytesIO
+
+try:
+    import torch  # type: ignore
+    from transformers import AutoTokenizer, AutoModel, AutoImageProcessor  # type: ignore
+    from PIL import Image  # type: ignore
+    import blobfile as _  # assert blobfile is installed
+    OPENCUA_AVAILABLE = True
+except Exception:
+    OPENCUA_AVAILABLE = False
+
+
+class OpenCUAModel:
+    """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not OPENCUA_AVAILABLE:
+            raise ImportError(
+                "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.image_processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype="auto",
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+            attn_implementation="sdpa",
+        )
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+
+    @staticmethod
+    def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
+        # Expect HF-format messages with content items type: "image" with data URL
+        for msg in reversed(messages):
+            for item in reversed(msg.get("content", [])):
+                if isinstance(item, dict) and item.get("type") == "image":
+                    url = item.get("image", "")
+                    if isinstance(url, str) and url.startswith("data:image/"):
+                        return url.split(",", 1)[1]
+        return ""
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
+        assert self.model is not None and self.tokenizer is not None and self.image_processor is not None
+
+        # Tokenize text side using chat template
+        input_ids = self.tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True
+        )
+        input_ids = torch.tensor([input_ids]).to(self.model.device)
+
+        # Prepare image inputs from last data URL image
+        image_b64 = self._extract_last_image_b64(messages)
+        pixel_values = None
+        grid_thws = None
+        if image_b64:
+            image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB")
+            image_info = self.image_processor.preprocess(images=[image])
+            pixel_values = torch.tensor(image_info["pixel_values"]).to(
+                dtype=torch.bfloat16, device=self.model.device
+            )
+            grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None
+
+        gen_kwargs: Dict[str, Any] = {
+            "max_new_tokens": max_new_tokens,
+            "temperature": 0,
+        }
+        if pixel_values is not None:
+            gen_kwargs["pixel_values"] = pixel_values
+        if grid_thws is not None:
+            gen_kwargs["grid_thws"] = grid_thws
+
+        with torch.no_grad():
+            generated_ids = self.model.generate(
+                input_ids,
+                **gen_kwargs,
+            )
+
+        # Remove prompt tokens
+        prompt_len = input_ids.shape[1]
+        generated_ids = generated_ids[:, prompt_len:]
+        output_text = self.tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return output_text
@@ -0,0 +1,75 @@
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch  # type: ignore
+    from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class Qwen2_5_VLModel:
+    """Qwen2.5-VL Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
agent/agent.py CHANGED
@@ -171,6 +171,7 @@ class ComputerAgent:
         use_prompt_caching: Optional[bool] = False,
         max_trajectory_budget: Optional[float | dict] = None,
         telemetry_enabled: Optional[bool] = True,
+        trust_remote_code: Optional[bool] = False,
         **kwargs
     ):
         """
@@ -190,6 +191,7 @@ class ComputerAgent:
             use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers.
             max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
             telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
+            trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
            **kwargs: Additional arguments passed to the agent loop
        """
        # If the loop is "human/human", we need to prefix a grounding model fallback
@@ -209,6 +211,7 @@ class ComputerAgent:
         self.use_prompt_caching = use_prompt_caching
         self.telemetry_enabled = telemetry_enabled
         self.kwargs = kwargs
+        self.trust_remote_code = trust_remote_code
 
         # == Add built-in callbacks ==
 
@@ -252,7 +255,8 @@ class ComputerAgent:
 
         # Register local model providers
         hf_adapter = HuggingFaceLocalAdapter(
-            device="auto"
+            device="auto",
+            trust_remote_code=self.trust_remote_code or False
         )
         human_adapter = HumanAdapter()
         mlx_adapter = MLXVLMAdapter()
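A minimal sketch of how the new flag reaches the local adapter; the "huggingface-local/" model prefix is an assumption about cua-agent's provider naming and the model identifier is a placeholder, neither is shown in this diff:

agent = ComputerAgent(
    model="huggingface-local/<org>/<vision-language-model>",
    trust_remote_code=True,   # stored on the agent, then forwarded to HuggingFaceLocalAdapter
)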