cua-agent 0.4.30__py3-none-any.whl → 0.4.32__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in that registry.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/adapters/huggingfacelocal_adapter.py +15 -66
- agent/adapters/models/__init__.py +33 -0
- agent/adapters/models/generic.py +75 -0
- agent/adapters/models/internvl.py +254 -0
- agent/adapters/models/opencua.py +100 -0
- agent/adapters/models/qwen2_5_vl.py +75 -0
- agent/agent.py +5 -1
- agent/callbacks/trajectory_saver.py +2 -0
- agent/cli.py +90 -1
- agent/integrations/hud/__init__.py +19 -0
- agent/loops/__init__.py +15 -1
- agent/loops/anthropic.py +2 -3
- agent/loops/composed_grounded.py +1 -1
- agent/loops/glm45v.py +3 -2
- agent/loops/gta1.py +1 -1
- agent/loops/holo.py +216 -0
- agent/loops/internvl.py +185 -0
- agent/loops/opencua.py +142 -0
- agent/loops/uitars.py +1 -1
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/METADATA +20 -4
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/RECORD +23 -15
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.30.dist-info → cua_agent-0.4.32.dist-info}/entry_points.txt +0 -0
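Taken together, the new agent/adapters/models package (added below) factors the old inline model loading out of HuggingFaceLocalAdapter into per-model handler classes (GenericHFModel, Qwen2_5_VLModel, InternVLModel, OpenCUAModel) that share one surface. A minimal sketch of that common interface, inferred from the diffs that follow (the Protocol itself is illustrative and not part of the package):

from typing import Any, Dict, List, Protocol

class ModelHandler(Protocol):
    """Illustrative protocol; the real handler classes share this shape by convention."""

    # Constructors look like: Handler(model_name: str, device: str = "auto", trust_remote_code: bool = False)
    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
        """Return generated text for HF-format messages (default max_new_tokens differs per handler)."""
        ...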
agent/adapters/huggingfacelocal_adapter.py
CHANGED
@@ -15,54 +15,31 @@ try:
 except ImportError:
     HF_AVAILABLE = False
 
+from .models import load_model as load_model_handler
 
 class HuggingFaceLocalAdapter(CustomLLM):
     """HuggingFace Local Adapter for running vision-language models locally."""
 
-    def __init__(self, device: str = "auto", **kwargs):
+    def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
         """Initialize the adapter.
 
         Args:
             device: Device to load model on ("auto", "cuda", "cpu", etc.)
+            trust_remote_code: Whether to trust remote code
             **kwargs: Additional arguments
         """
         super().__init__()
         self.device = device
-        self.
-
+        self.trust_remote_code = trust_remote_code
+        # Cache for model handlers keyed by model_name
+        self._handlers: Dict[str, Any] = {}
         self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool
 
-    def
-        """
-
-
-
-
-        Returns:
-            Tuple of (model, processor)
-        """
-        if model_name not in self.models:
-            # Load model
-            model = AutoModelForImageTextToText.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map=self.device,
-                attn_implementation="sdpa"
-            )
-
-            # Load processor
-            processor = AutoProcessor.from_pretrained(
-                model_name,
-                min_pixels=3136,
-                max_pixels=4096 * 2160,
-                device_map=self.device
-            )
-
-            # Cache them
-            self.models[model_name] = model
-            self.processors[model_name] = processor
-
-        return self.models[model_name], self.processors[model_name]
+    def _get_handler(self, model_name: str):
+        """Get or create a model handler for the given model name."""
+        if model_name not in self._handlers:
+            self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code)
+        return self._handlers[model_name]
 
     def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Convert OpenAI format messages to HuggingFace format.
@@ -133,41 +110,13 @@ class HuggingFaceLocalAdapter(CustomLLM):
         if ignored_kwargs:
             warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
 
-        # Load model and processor
-        model, processor = self._load_model_and_processor(model_name)
-
         # Convert messages to HuggingFace format
         hf_messages = self._convert_messages(messages)
 
-        #
-
-
-
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        )
-
-        # Move inputs to the same device as model
-        inputs = inputs.to(model.device)
-
-        # Generate response
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-
-        # Trim input tokens from output
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-
-        # Decode output
-        output_text = processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )
-
-        return output_text[0] if output_text else ""
+        # Delegate to model handler
+        handler = self._get_handler(model_name)
+        generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
+        return generated_text
 
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
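With this change, generation in the adapter is a thin delegation: OpenAI-style messages are converted by _convert_messages() into the HF format the handlers document, and the cached handler does the rest. An illustration of that message shape (the values are made up; only the structure matters):

# Illustrative HF-format messages as consumed by handler.generate()
hf_messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "data:image/png;base64,iVBORw0KGgo..."},  # screenshot as a data URL
            {"type": "text", "text": "Click the Submit button."},
        ],
    }
]
# text = handler.generate(hf_messages, max_new_tokens=128)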
agent/adapters/models/__init__.py
ADDED
@@ -0,0 +1,33 @@
+from typing import Optional
+
+try:
+    from transformers import AutoConfig
+    HF_AVAILABLE = True
+except ImportError:
+    HF_AVAILABLE = False
+
+from .generic import GenericHFModel
+from .opencua import OpenCUAModel
+from .qwen2_5_vl import Qwen2_5_VLModel
+from .internvl import InternVLModel
+
+def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
+    """Factory function to load and return the right model handler instance.
+
+    - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
+    - Otherwise, return GenericHFModel
+    """
+    if not HF_AVAILABLE:
+        raise ImportError(
+            "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+        )
+    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+    cls = cfg.__class__.__name__
+    print(f"cls: {cls}")
+    if "OpenCUA" in cls:
+        return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    elif "Qwen2_5_VL" in cls:
+        return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    elif "InternVL" in cls:
+        return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
+    return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
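A minimal usage sketch of the new factory (the checkpoint name is illustrative): dispatch is on the transformers config class name, so OpenCUA, Qwen2.5-VL, and InternVL checkpoints get their dedicated handlers and anything else falls back to GenericHFModel.

from agent.adapters.models import load_model

# Checkpoint name is illustrative; trust_remote_code is needed for repos that ship custom modeling code.
handler = load_model("OpenGVLab/InternVL3-8B", device="auto", trust_remote_code=True)
text = handler.generate(
    [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
    max_new_tokens=64,
)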
agent/adapters/models/generic.py
ADDED
@@ -0,0 +1,75 @@
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch # type: ignore
+    from transformers import AutoModel, AutoProcessor # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class GenericHFModel:
+    """Generic Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.float16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
agent/adapters/models/internvl.py
ADDED
@@ -0,0 +1,254 @@
+from __future__ import annotations
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch # type: ignore
+    from transformers import AutoModel, AutoTokenizer # type: ignore
+    # Attempt to import InternVL's model dependencies
+    import einops as _ # type: ignore
+    import timm as _ # type: ignore
+    from PIL import Image # type: ignore
+    import torchvision.transforms as T # type: ignore
+    from torchvision.transforms.functional import InterpolationMode # type: ignore
+    import base64 # type: ignore
+    from io import BytesIO # type: ignore
+    import requests # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class InternVLModel:
+    """Generic Hugging Face vision-language model handler.
+    Uses InternVL's native `model.chat()` interface with `AutoTokenizer`.
+    Provides preprocessing to support multi-turn conversations with multiple images.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            use_flash_attn=True,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        ).eval()
+        # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            trust_remote_code=self.trust_remote_code,
+            use_fast=False,
+        )
+
+    # ---- Image preprocessing utilities adapted from InternVL docs ----
+    IMAGENET_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_STD = (0.229, 0.224, 0.225)
+
+    def _build_transform(self, input_size: int) -> T.Compose:
+        MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
+        transform = T.Compose([
+            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD)
+        ])
+        return transform
+
+    def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
+        best_ratio_diff = float('inf')
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+
+    def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+
+        target_ratios = set(
+            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+            i * j <= max_num and i * j >= min_num)
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        target_aspect_ratio = self._find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        resized_img = image.resize((target_width, target_height))
+        processed_images: List[Image.Image] = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size
+            )
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def _load_image_from_source(self, src: str) -> Image.Image:
+        """Load PIL image from various sources: data URL, http(s), or local path."""
+        if src.startswith("data:image/"):
+            # data URL base64
+            header, b64data = src.split(",", 1)
+            img_bytes = base64.b64decode(b64data)
+            return Image.open(BytesIO(img_bytes)).convert('RGB')
+        if src.startswith("http://") or src.startswith("https://"):
+            resp = requests.get(src, timeout=10)
+            resp.raise_for_status()
+            return Image.open(BytesIO(resp.content)).convert('RGB')
+        # Assume local file path
+        return Image.open(src).convert('RGB')
+
+    def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
+        transform = self._build_transform(input_size=input_size)
+        pixel_values_list = []
+        num_patches_list: List[int] = []
+        for img in images:
+            tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+            pv = [transform(tile) for tile in tiles]
+            pv = torch.stack(pv)
+            num_patches_list.append(pv.shape[0])
+            pixel_values_list.append(pv)
+        if not pixel_values_list:
+            return None, []
+        pixel_values = torch.cat(pixel_values_list)
+        return pixel_values, num_patches_list
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+
+        This implementation constructs InternVL-compatible inputs and uses
+        `model.chat(tokenizer, pixel_values, question, history=...)` to avoid
+        relying on AutoProcessor (which fails for some tokenizers).
+        """
+        assert self.model is not None and self.tokenizer is not None
+
+        # Build textual context and collect images and the final question
+        context_lines: List[str] = []
+        all_images: List[Image.Image] = []
+        last_user_text_parts: List[str] = []
+
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", [])
+            if isinstance(content, str):
+                content_items = [{"type": "text", "text": content}]
+            else:
+                content_items = content
+
+            if role == "user":
+                # Collect text and images
+                parts_text: List[str] = []
+                for item in content_items:
+                    if item.get("type") == "text":
+                        t = item.get("text", "")
+                        if t:
+                            parts_text.append(t)
+                    elif item.get("type") == "image":
+                        url = item.get("image", "")
+                        if url:
+                            try:
+                                all_images.append(self._load_image_from_source(url))
+                            except Exception:
+                                # Ignore failed image loads but keep going
+                                pass
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"User: {text}")
+                # Track last user text separately for question
+                last_user_text_parts = parts_text or last_user_text_parts
+            elif role == "assistant":
+                # Only keep text content for history
+                parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"]
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"Assistant: {text}")
+
+        # Prepare pixel values for all collected images (across turns)
+        pixel_values = None
+        num_patches_list: List[int] = []
+        if all_images:
+            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
+            if pixel_values is not None:
+                # Convert dtype/device as in docs
+                pixel_values = pixel_values.to(torch.bfloat16)
+                # Chat API expects tensors on CUDA when model is on CUDA
+                try:
+                    pixel_values = pixel_values.to(self.model.device)
+                except Exception:
+                    pass
+
+        # Build question with any prior context and numbered image placeholders
+        if all_images:
+            # Separate images layout: Image-1: <image> ... then question text
+            prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
+            prefix = "\n".join(prefix_lines) + "\n"
+        else:
+            prefix = ""
+
+        last_user_text = "\n".join(last_user_text_parts).strip()
+        # Combine prior text-only turns as context to emulate multi-turn
+        context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""
+        base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
+        if context_text:
+            question = (context_text + "\n" + prefix + base_question).strip()
+        else:
+            question = (prefix + base_question).strip()
+
+        # Generation config
+        generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
+
+        # Call InternVL chat
+        try:
+            if pixel_values is None:
+                # Pure-text conversation (embed prior turns in question)
+                response = self.model.chat(self.tokenizer, None, question, generation_config)
+            else:
+                # Multi-image: pass num_patches_list if >1 image
+                if len(num_patches_list) > 1:
+                    response = self.model.chat(
+                        self.tokenizer,
+                        pixel_values,
+                        question,
+                        generation_config,
+                        num_patches_list=num_patches_list,
+                    )
+                else:
+                    response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
+        except Exception as e:
+            # Fallback: return empty string to avoid crashing the adapter
+            return ""
+
+        return response or ""
agent/adapters/models/opencua.py
ADDED
@@ -0,0 +1,100 @@
+from typing import List, Dict, Any
+import re
+import base64
+from io import BytesIO
+
+try:
+    import torch # type: ignore
+    from transformers import AutoTokenizer, AutoModel, AutoImageProcessor # type: ignore
+    from PIL import Image # type: ignore
+    import blobfile as _ # assert blobfile is installed
+    OPENCUA_AVAILABLE = True
+except Exception:
+    OPENCUA_AVAILABLE = False
+
+
+class OpenCUAModel:
+    """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not OPENCUA_AVAILABLE:
+            raise ImportError(
+                "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.tokenizer = None
+        self.image_processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+        self.model = AutoModel.from_pretrained(
+            self.model_name,
+            torch_dtype="auto",
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+            attn_implementation="sdpa",
+        )
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            self.model_name, trust_remote_code=self.trust_remote_code
+        )
+
+    @staticmethod
+    def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
+        # Expect HF-format messages with content items type: "image" with data URL
+        for msg in reversed(messages):
+            for item in reversed(msg.get("content", [])):
+                if isinstance(item, dict) and item.get("type") == "image":
+                    url = item.get("image", "")
+                    if isinstance(url, str) and url.startswith("data:image/"):
+                        return url.split(",", 1)[1]
+        return ""
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
+        assert self.model is not None and self.tokenizer is not None and self.image_processor is not None
+
+        # Tokenize text side using chat template
+        input_ids = self.tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True
+        )
+        input_ids = torch.tensor([input_ids]).to(self.model.device)
+
+        # Prepare image inputs from last data URL image
+        image_b64 = self._extract_last_image_b64(messages)
+        pixel_values = None
+        grid_thws = None
+        if image_b64:
+            image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB")
+            image_info = self.image_processor.preprocess(images=[image])
+            pixel_values = torch.tensor(image_info["pixel_values"]).to(
+                dtype=torch.bfloat16, device=self.model.device
+            )
+            grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None
+
+        gen_kwargs: Dict[str, Any] = {
+            "max_new_tokens": max_new_tokens,
+            "temperature": 0,
+        }
+        if pixel_values is not None:
+            gen_kwargs["pixel_values"] = pixel_values
+        if grid_thws is not None:
+            gen_kwargs["grid_thws"] = grid_thws
+
+        with torch.no_grad():
+            generated_ids = self.model.generate(
+                input_ids,
+                **gen_kwargs,
+            )
+
+        # Remove prompt tokens
+        prompt_len = input_ids.shape[1]
+        generated_ids = generated_ids[:, prompt_len:]
+        output_text = self.tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return output_text
agent/adapters/models/qwen2_5_vl.py
ADDED
@@ -0,0 +1,75 @@
+from typing import List, Dict, Any, Optional
+
+# Hugging Face imports are local to avoid hard dependency at module import
+try:
+    import torch # type: ignore
+    from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore
+    HF_AVAILABLE = True
+except Exception:
+    HF_AVAILABLE = False
+
+
+class Qwen2_5_VLModel:
+    """Qwen2.5-VL Hugging Face vision-language model handler.
+    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
+    """
+
+    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
+        if not HF_AVAILABLE:
+            raise ImportError(
+                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
+            )
+        self.model_name = model_name
+        self.device = device
+        self.model = None
+        self.processor = None
+        self.trust_remote_code = trust_remote_code
+        self._load()
+
+    def _load(self) -> None:
+        # Load model
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16,
+            device_map=self.device,
+            attn_implementation="sdpa",
+            trust_remote_code=self.trust_remote_code,
+        )
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+            device_map=self.device,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str:
+        """Generate text for the given HF-format messages.
+        messages: [{ role, content: [{type:'text'|'image', text|image}] }]
+        """
+        assert self.model is not None and self.processor is not None
+        # Apply chat template and tokenize
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        # Move inputs to the same device as model
+        inputs = inputs.to(self.model.device)
+        # Generate
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+        # Trim prompt tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        return output_text[0] if output_text else ""
agent/agent.py
CHANGED
@@ -171,6 +171,7 @@ class ComputerAgent:
         use_prompt_caching: Optional[bool] = False,
         max_trajectory_budget: Optional[float | dict] = None,
         telemetry_enabled: Optional[bool] = True,
+        trust_remote_code: Optional[bool] = False,
         **kwargs
     ):
         """
@@ -190,6 +191,7 @@ class ComputerAgent:
             use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers.
             max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
             telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
+            trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
             **kwargs: Additional arguments passed to the agent loop
         """
         # If the loop is "human/human", we need to prefix a grounding model fallback
@@ -209,6 +211,7 @@ class ComputerAgent:
         self.use_prompt_caching = use_prompt_caching
         self.telemetry_enabled = telemetry_enabled
         self.kwargs = kwargs
+        self.trust_remote_code = trust_remote_code
 
         # == Add built-in callbacks ==
 
@@ -252,7 +255,8 @@ class ComputerAgent:
 
         # Register local model providers
         hf_adapter = HuggingFaceLocalAdapter(
-            device="auto"
+            device="auto",
+            trust_remote_code=self.trust_remote_code or False
         )
         human_adapter = HumanAdapter()
         mlx_adapter = MLXVLMAdapter()
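From the caller's side, the flag is surfaced on ComputerAgent and forwarded to HuggingFaceLocalAdapter when local model providers are registered. A hedged sketch (import path and model string are assumptions; other constructor arguments are omitted):

from agent import ComputerAgent  # import path assumed

agent = ComputerAgent(
    model="huggingface-local/OpenGVLab/InternVL3-8B",  # model string illustrative
    trust_remote_code=True,  # forwarded to HuggingFaceLocalAdapter for repos with custom modeling code
)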