nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaai/__init__.py +99 -0
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +68 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +93 -0
- nexaai/asr_impl/pybind_asr_impl.py +127 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +7 -0
- nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
- nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
- nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
- nexaai/binds/cpu_gpu/libggml.dylib +0 -0
- nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
- nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/metal/libnexa_plugin.dylib +0 -0
- nexaai/binds/metal/py-lib/ml.py +888 -0
- nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
- nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
- nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
- nexaai/binds/metal/py-lib/profiling.py +239 -0
- nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
- nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
- nexaai/binds/nexaml/libggml-base.dylib +0 -0
- nexaai/binds/nexaml/libggml-cpu.so +0 -0
- nexaai/binds/nexaml/libggml-metal.so +0 -0
- nexaai/binds/nexaml/libggml.dylib +0 -0
- nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
- nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
- nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
- nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
- nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
- nexaai/binds/nexaml/libnexaproc.dylib +0 -0
- nexaai/binds/nexaml/libomp.dylib +0 -0
- nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
- nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
- nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
- nexaai/common.py +106 -0
- nexaai/cv.py +95 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +91 -0
- nexaai/cv_impl/pybind_cv_impl.py +124 -0
- nexaai/diarize.py +80 -0
- nexaai/diarize_impl/__init__.py +1 -0
- nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
- nexaai/embedder.py +73 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
- nexaai/image_gen.py +141 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
- nexaai/llm.py +98 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +271 -0
- nexaai/llm_impl/pybind_llm_impl.py +238 -0
- nexaai/log.py +92 -0
- nexaai/mlx_backend/asr/__init__.py +12 -0
- nexaai/mlx_backend/asr/interface.py +122 -0
- nexaai/mlx_backend/common/__init__.py +0 -0
- nexaai/mlx_backend/common/utils.py +25 -0
- nexaai/mlx_backend/cv/__init__.py +0 -0
- nexaai/mlx_backend/cv/generate.py +195 -0
- nexaai/mlx_backend/cv/interface.py +162 -0
- nexaai/mlx_backend/cv/main.py +81 -0
- nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
- nexaai/mlx_backend/embedding/__init__.py +0 -0
- nexaai/mlx_backend/embedding/generate.py +333 -0
- nexaai/mlx_backend/embedding/interface.py +617 -0
- nexaai/mlx_backend/embedding/main.py +173 -0
- nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
- nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
- nexaai/mlx_backend/image_gen/__init__.py +1 -0
- nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
- nexaai/mlx_backend/image_gen/interface.py +82 -0
- nexaai/mlx_backend/image_gen/main.py +281 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
- nexaai/mlx_backend/llm/__init__.py +0 -0
- nexaai/mlx_backend/llm/generate.py +149 -0
- nexaai/mlx_backend/llm/interface.py +764 -0
- nexaai/mlx_backend/llm/main.py +68 -0
- nexaai/mlx_backend/ml.py +888 -0
- nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/mlx_backend/mlx_audio/server.py +525 -0
- nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
- nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
- nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
- nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
- nexaai/mlx_backend/mlx_audio/utils.py +237 -0
- nexaai/mlx_backend/mlx_audio/version.py +1 -0
- nexaai/mlx_backend/profiling.py +239 -0
- nexaai/mlx_backend/rerank/__init__.py +0 -0
- nexaai/mlx_backend/rerank/generate.py +174 -0
- nexaai/mlx_backend/rerank/interface.py +287 -0
- nexaai/mlx_backend/rerank/main.py +127 -0
- nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
- nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
- nexaai/mlx_backend/sd/__init__.py +1 -0
- nexaai/mlx_backend/sd/interface.py +362 -0
- nexaai/mlx_backend/sd/main.py +286 -0
- nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
- nexaai/mlx_backend/sd/modeling/clip.py +116 -0
- nexaai/mlx_backend/sd/modeling/config.py +65 -0
- nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
- nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
- nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
- nexaai/mlx_backend/sd/modeling/unet.py +460 -0
- nexaai/mlx_backend/sd/modeling/vae.py +274 -0
- nexaai/mlx_backend/tts/__init__.py +12 -0
- nexaai/mlx_backend/tts/interface.py +276 -0
- nexaai/mlx_backend/vlm/__init__.py +3 -0
- nexaai/mlx_backend/vlm/generate.py +572 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +559 -0
- nexaai/mlx_backend/vlm/main.py +365 -0
- nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
- nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
- nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
- nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
- nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
- nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
- nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
- nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
- nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
- nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
- nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
- nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
- nexaai/rerank.py +57 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
- nexaai/runtime.py +68 -0
- nexaai/runtime_error.py +24 -0
- nexaai/tts.py +75 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +94 -0
- nexaai/tts_impl/pybind_tts_impl.py +43 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/manifest_utils.py +531 -0
- nexaai/utils/model_manager.py +1745 -0
- nexaai/utils/model_types.py +49 -0
- nexaai/utils/progress_tracker.py +389 -0
- nexaai/utils/quantization_utils.py +245 -0
- nexaai/vlm.py +130 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
- nexaai-1.0.29.dist-info/METADATA +35 -0
- nexaai-1.0.29.dist-info/RECORD +580 -0
- nexaai-1.0.29.dist-info/WHEEL +5 -0
- nexaai-1.0.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,476 @@
```python
from typing import Any, Dict, List, Optional, Union
import mlx.core as mx
import numpy as np
from PIL import Image
import io
import base64


class Qwen3VLProcessor:
    def __init__(self, tokenizer=None, image_processor=None):
        self.tokenizer = tokenizer
        self.image_processor = image_processor

        # Vision tokens (following the official implementation)
        self.image_token = "<|image_pad|>"
        self.vision_start_token = "<|vision_start|>"
        self.vision_end_token = "<|vision_end|>"

        # Token IDs (will be set properly if tokenizer is provided)
        if tokenizer:
            self.image_token_id = getattr(tokenizer, 'image_token_id',
                                          tokenizer.convert_tokens_to_ids(self.image_token))
            self.vision_start_token_id = getattr(tokenizer, 'vision_start_token_id',
                                                 tokenizer.convert_tokens_to_ids(self.vision_start_token))
            self.vision_end_token_id = getattr(tokenizer, 'vision_end_token_id',
                                               tokenizer.convert_tokens_to_ids(self.vision_end_token))
        else:
            # Fallback IDs for when no tokenizer is provided
            self.image_token_id = 151655
            self.vision_start_token_id = 151652
            self.vision_end_token_id = 151653

        # Image processing parameters (following Qwen3VL defaults)
        self.min_pixels = 4096
        self.max_pixels = 16777216
        self.patch_size = 16
        self.merge_size = 2
        self.temporal_patch_size = 2

        # Add the missing image_mean and image_std
        self.image_mean = [0.5, 0.5, 0.5]
        self.image_std = [0.5, 0.5, 0.5]

    def _extract_patches(self, image_array: np.ndarray) -> np.ndarray:
        """
        Extract patches from image array to create proper tensor for Conv3d.

        Args:
            image_array: Shape (C, H, W)

        Returns:
            patches: Flattened tensor that can be reshaped to
                (num_patches, C, temporal_patch_size, patch_size, patch_size)
        """
        C, H, W = image_array.shape

        # Calculate number of patches
        patch_h = H // self.patch_size
        patch_w = W // self.patch_size

        # Extract spatial patches
        # Reshape to (C, patch_h, patch_size, patch_w, patch_size)
        patches = image_array.reshape(
            C, patch_h, self.patch_size, patch_w, self.patch_size
        )

        # Rearrange to (patch_h, patch_w, C, patch_size, patch_size)
        patches = patches.transpose(1, 3, 0, 2, 4)

        # Reshape to (patch_h * patch_w, C, patch_size, patch_size)
        num_patches = patch_h * patch_w
        patches = patches.reshape(num_patches, C, self.patch_size, self.patch_size)

        # Add temporal dimension by duplicating the patches
        # Shape: (num_patches, C, temporal_patch_size, patch_size, patch_size)
        patches = np.tile(patches[:, :, None, :, :], (1, 1, self.temporal_patch_size, 1, 1))

        return patches

    def _process_single_image(self, image: Union[str, Image.Image, np.ndarray]) -> Dict[str, Any]:
        """Process a single image and return processed data."""
        if isinstance(image, str):
            if image.startswith('data:image'):
                image_data = base64.b64decode(image.split(',')[1])
                image = Image.open(io.BytesIO(image_data))
            else:
                image = Image.open(image)
        elif isinstance(image, np.ndarray):
            image = Image.fromarray(image)

        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Resize image based on pixel constraints
        width, height = image.size
        pixels = width * height

        if pixels < self.min_pixels:
            scale = (self.min_pixels / pixels) ** 0.5
            width = int(width * scale)
            height = int(height * scale)
        elif pixels > self.max_pixels:
            scale = (self.max_pixels / pixels) ** 0.5
            width = int(width * scale)
            height = int(height * scale)

        # Ensure dimensions are multiples of patch_size AND work with merge_size
        # Use fraction-based rounding to match PyTorch behavior
        import math

        width_frac = (width / self.patch_size) % 1
        height_frac = (height / self.patch_size) % 1

        # Round up if fraction >= 0.3, otherwise round down
        # This matches the observed PyTorch processor behavior
        if width_frac >= 0.3:
            width = math.ceil(width / self.patch_size) * self.patch_size
        else:
            width = (width // self.patch_size) * self.patch_size

        if height_frac >= 0.3:
            height = math.ceil(height / self.patch_size) * self.patch_size
        else:
            height = (height // self.patch_size) * self.patch_size

        # CRITICAL: Ensure patch dimensions are even for 2x2 merging
        # If either dimension is odd, add one more patch to make it even
        h_patches = height // self.patch_size
        w_patches = width // self.patch_size

        if h_patches % 2 == 1:
            height += self.patch_size  # Add one more patch row

        if w_patches % 2 == 1:
            width += self.patch_size  # Add one more patch column

        if width == 0 or height == 0:
            width = height = self.patch_size

        image = image.resize((width, height), Image.Resampling.LANCZOS)

        # Convert to array and normalize
        image_array = np.array(image).astype(np.float32) / 255.0

        # Qwen3VL normalization
        mean = np.array(self.image_mean)
        std = np.array(self.image_std)
        image_array = (image_array - mean) / std

        # Convert HWC to CHW
        image_array = np.transpose(image_array, (2, 0, 1))

        # Calculate grid dimensions
        h_patches = height // self.patch_size
        w_patches = width // self.patch_size

        # Extract patches using the exact same method as PyTorch Conv3d unfold
        C, H, W = image_array.shape

        # Reshape to extract patches: (C, H//patch_size, patch_size, W//patch_size, patch_size)
        patches = image_array.reshape(C, h_patches, self.patch_size, w_patches, self.patch_size)

        # Rearrange to group patches: (h_patches, w_patches, C, patch_size, patch_size)
        patches = patches.transpose(1, 3, 0, 2, 4)

        # Flatten spatial patches: (h_patches * w_patches, C, patch_size, patch_size)
        patches = patches.reshape(-1, C, self.patch_size, self.patch_size)

        # Add temporal dimension: (num_patches, C, T, patch_size, patch_size)
        patches_with_temporal = np.tile(patches[:, :, None, :, :], (1, 1, self.temporal_patch_size, 1, 1))

        # Flatten each patch in the order: C, T, H, W to match PyTorch Conv3d
        pixel_values = patches_with_temporal.reshape(patches_with_temporal.shape[0], -1)

        # Apply spatial merging reordering to match PyTorch processor
        # Group patches into merge_size x merge_size blocks and reorder
        pixel_values = pixel_values.reshape(h_patches // self.merge_size, self.merge_size,
                                            w_patches // self.merge_size, self.merge_size, -1)
        # Rearrange to (h_blocks, w_blocks, merge_size*merge_size, feature_dim)
        pixel_values = pixel_values.transpose(0, 2, 1, 3, 4)
        pixel_values = pixel_values.reshape(h_patches // self.merge_size,
                                            w_patches // self.merge_size,
                                            self.merge_size * self.merge_size, -1)
        # Flatten to (total_merged_patches, feature_dim)
        pixel_values = pixel_values.reshape(-1, pixel_values.shape[-1])

        return {
            'pixel_values': pixel_values,  # Shape: (num_patches, 1536)
            'grid_thw': [1, h_patches, w_patches]  # T=1 for images
        }

    def _insert_image_tokens(self, text: str, image_grid_thw: List[List[int]]) -> str:
        """Insert the correct number of image tokens based on grid dimensions."""
        if not image_grid_thw:
            return text

        merge_length = self.merge_size ** 2
        index = 0

        while self.image_token in text and index < len(image_grid_thw):
            # Calculate number of tokens needed for this image
            t, h, w = image_grid_thw[index]
            num_image_tokens = (t * h * w) // merge_length

            # Replace one image token with the calculated number of tokens
            text = text.replace(self.image_token, self.image_token * num_image_tokens, 1)
            index += 1

        return text

    def __call__(
        self,
        text: Union[str, List[str]] = None,
        images: Union[Image.Image, List[Image.Image], str, List[str], np.ndarray, List[np.ndarray]] = None,
        return_tensors: str = "mlx",
        **kwargs
    ) -> Dict[str, mx.array]:
        """
        Process text and images for Qwen3VL model.

        Returns:
            Dict containing:
            - input_ids: Tokenized text with proper image tokens
            - pixel_values: Processed image patches (if images provided)
            - image_grid_thw: Grid dimensions for images (if images provided)
        """
        result = {}

        # Process images first
        grid_thw_list = None
        if images is not None:
            if not isinstance(images, list):
                images = [images]

            # Check if images list is not empty
            if len(images) > 0:
                if self.image_processor is not None:
                    image_inputs = self.image_processor(images=images, return_tensors="np")
                    result["pixel_values"] = mx.array(image_inputs["pixel_values"])
                    result["image_grid_thw"] = mx.array(image_inputs["image_grid_thw"])
                    grid_thw_list = image_inputs["image_grid_thw"].tolist()
                else:
                    processed_patches = []
                    grid_thw_list = []
                    for image in images:
                        processed = self._process_single_image(image)
                        processed_patches.append(processed["pixel_values"])
                        grid_thw_list.append(processed["grid_thw"])
                    all_patches = np.concatenate(processed_patches, axis=0)
                    result["pixel_values"] = mx.array(all_patches)
                    result["image_grid_thw"] = mx.array(np.array(grid_thw_list))

        # Process text
        if text is not None:
            if not isinstance(text, list):
                text = [text]
            text = text.copy()
            if grid_thw_list is not None:
                for i in range(len(text)):
                    text[i] = self._insert_image_tokens(text[i], grid_thw_list)
            if self.tokenizer:
                text_inputs = self.tokenizer(text, return_tensors="np", **kwargs)
                result["input_ids"] = mx.array(text_inputs["input_ids"])
                if "attention_mask" in text_inputs:
                    result["attention_mask"] = mx.array(text_inputs["attention_mask"])
            else:
                all_tokens = []
                for t in text:
                    tokens = [hash(word) % 50000 for word in t.split()]
                    all_tokens.append(tokens)
                max_len = max(len(tokens) for tokens in all_tokens)
                padded_tokens = []
                for tokens in all_tokens:
                    padded = tokens + [0] * (max_len - len(tokens))
                    padded_tokens.append(padded)
                result["input_ids"] = mx.array(np.array(padded_tokens))

        return result

    def _extract_images_and_text_from_messages(self, messages: List[Dict]) -> tuple:
        """Extract images and text from message format."""
        images = []
        text_parts = []

        for message in messages:
            role = message.get("role", "user")
            content = message.get("content", [])

            if isinstance(content, str):
                # Simple text content
                text_parts.append({"role": role, "content": content})
            elif isinstance(content, list):
                # Multi-modal content
                message_text_parts = []
                for item in content:
                    if item.get("type") == "image":
                        images.append(item.get("image"))
                        message_text_parts.append("<|vision_start|><|image_pad|><|vision_end|>")
                    elif item.get("type") == "text":
                        message_text_parts.append(item.get("text", ""))

                combined_text = "".join(message_text_parts)
                text_parts.append({"role": role, "content": combined_text})

        return images, text_parts

    def apply_chat_template(
        self,
        messages: List[Dict],
        add_generation_prompt: bool = True,
        tokenize: bool = False,
        **kwargs
    ) -> str:
        """Apply chat template to messages."""
        # Handle multi-modal messages
        if any(isinstance(msg.get("content"), list) for msg in messages):
            _, text_messages = self._extract_images_and_text_from_messages(messages)
            messages = text_messages

        if not self.tokenizer:
            # Fallback chat template
            formatted_messages = []
            for msg in messages:
                role = msg.get("role", "user")
                content = msg.get("content", "")
                formatted_messages.append(f"<|im_start|>{role}\n{content}<|im_end|>")

            result = "\n".join(formatted_messages)
            if add_generation_prompt:
                result += "\n<|im_start|>assistant\n"
            return result

        # Use tokenizer and manually remove system message to match ground truth
        result = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=add_generation_prompt,
            tokenize=tokenize,
            **kwargs
        )

        # Remove system message to match ground truth format
        system_prefix = '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n'
        if result.startswith(system_prefix):
            result = result[len(system_prefix):]

        return result

    def messages_to_text(
        self,
        messages: List[Dict],
        add_generation_prompt: bool = True,
        **kwargs
    ) -> tuple:
        """
        Step 1: Convert multi-modal messages to text format.

        Args:
            messages: List of message dicts with role and content
            add_generation_prompt: Whether to add generation prompt
            **kwargs: Additional arguments

        Returns:
            Tuple of (text, images) where text is the formatted string and images is list of image objects
        """
        # Extract images and text from messages
        images, text_messages = self._extract_images_and_text_from_messages(messages)

        # Apply chat template
        text = self.apply_chat_template(
            text_messages,
            add_generation_prompt=add_generation_prompt,
            tokenize=False,
            **kwargs
        )

        # Load images from URLs if needed
        processed_images = []
        for img in images:
            if isinstance(img, str) and (img.startswith('http://') or img.startswith('https://')):
                # Load image from URL
                import requests
                from io import BytesIO
                try:
                    response = requests.get(img, stream=True, timeout=10)
                    img = Image.open(BytesIO(response.content))
                except Exception as e:
                    raise ValueError(f"Failed to load image from URL {img}: {e}")
            processed_images.append(img)

        return text, processed_images

    def text_to_input_ids(
        self,
        text: str,
        images: List = None,
        return_tensors: str = "mlx",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Step 2: Process text and images into input_ids and pixel_values.

        Args:
            text: Formatted text string (from messages_to_text)
            images: List of image objects
            return_tensors: Format of returned tensors
            **kwargs: Additional arguments

        Returns:
            Dict with input_ids, pixel_values, image_grid_thw
        """
        return self(
            text=[text],
            images=images,
            return_tensors=return_tensors,
            **kwargs
        )

    def process_messages(
        self,
        messages: List[Dict],
        add_generation_prompt: bool = True,
        return_tensors: str = "mlx",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Process multi-modal messages end-to-end (combines messages_to_text + text_to_input_ids).

        Args:
            messages: List of message dicts with role and content
            add_generation_prompt: Whether to add generation prompt
            return_tensors: Format of returned tensors
            **kwargs: Additional arguments

        Returns:
            Dict with input_ids, pixel_values, image_grid_thw
        """
        # Step 1: Convert messages to text
        text, processed_images = self.messages_to_text(
            messages,
            add_generation_prompt=add_generation_prompt,
            **kwargs
        )

        # Step 2: Convert text to input_ids
        return self.text_to_input_ids(
            text,
            images=processed_images,
            return_tensors=return_tensors,
            **kwargs
        )

    def post_process_image_text_to_text(
        self,
        generated_outputs,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> List[str]:
        """Decode generated token IDs back to text."""
        if self.tokenizer:
            if hasattr(generated_outputs, 'tolist'):
                generated_outputs = generated_outputs.tolist()

            return self.tokenizer.batch_decode(
                generated_outputs,
                skip_special_tokens=skip_special_tokens,
                **kwargs
            )
        else:
            # Fallback decoding
            return ["[Decoded text - tokenizer not available]"] * len(generated_outputs)


# Convenience function
def create_qwen3vl_processor(tokenizer=None, image_processor=None):
    """Create a Qwen3VL processor instance."""
    return Qwen3VLProcessor(tokenizer=tokenizer, image_processor=image_processor)
```