PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py ADDED Viewed

@@ -0,0 +1,393 @@
+import json
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+import mlx.core as mx
+import numpy as np
+from PIL import Image
+from transformers import (
+    AutoImageProcessor,
+    AutoProcessor,
+    AutoTokenizer,
+    BatchFeature,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
+from transformers.image_utils import ImageFeatureExtractionMixin
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+# Constants for image processing (from internvl_chat.py)
+# Per-channel mean / standard deviation that build_transform applies after
+# the pixel values have been scaled to [0, 1].
+IMAGENET_MEAN = np.array([0.485, 0.456, 0.406])
+IMAGENET_STD = np.array([0.229, 0.224, 0.225])
+# Default Jinja chat template for InternVLChatProcessor. For every message it
+# emits the capitalized role, one "<image>\n" per image entry, then the text
+# entries, and optionally an "Assistant:\n" generation prompt.
+# chat_template = get_conv_template("internvl2_5")
+chat_template = "{% for message in messages %}{{message['role'].capitalize() + ': '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['content'] }}{% endfor %}{{'\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:\n' }}{% endif %}"
+# Marker strings that InternVLChatProcessor.__call__ substitutes for each
+# "<image>" placeholder: start marker, repeated context token, end marker.
+IMG_START_TOKEN = "<img>"
+IMG_END_TOKEN = "</img>"
+IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+def build_transform(input_size):
+    """
+    Create the per-tile preprocessing function applied before the vision encoder.
+    Args:
+        input_size (int): Side length, in pixels, of the square each image is resized to.
+    Returns:
+        function: Callable mapping a PIL image to a channel-first mx.array that has
+            been scaled to [0, 1] and normalized with IMAGENET_MEAN / IMAGENET_STD.
+    """
+    channel_mean = mx.array(IMAGENET_MEAN)
+    channel_std = mx.array(IMAGENET_STD)
+    def transform(img: Image.Image) -> mx.array:
+        # Anything that is not already RGB (grayscale, RGBA, palette, ...) is converted.
+        rgb = img if img.mode == "RGB" else img.convert("RGB")
+        resized = rgb.resize(
+            (input_size, input_size), resample=Image.Resampling.BICUBIC
+        )
+        # (H, W, C) pixel array -> float32 values in [0, 1]
+        scaled = np.array(resized).astype(np.float32) / 255.0
+        # Move channels first so the (3,) statistics broadcast as (3, 1, 1).
+        chw = mx.array(scaled).transpose(2, 0, 1)
+        return (chw - channel_mean[:, None, None]) / channel_std[:, None, None]
+    return transform
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """
+    Pick the (columns, rows) grid whose aspect ratio best matches the image.
+    Candidates are scanned in order. A strictly smaller aspect-ratio difference
+    always wins; on an exact tie the candidate whose total tiled area
+    (image_size**2 * columns * rows) is closer to width * height replaces the
+    current choice. Returns (1, 1) when target_ratios is empty.
+    """
+    tile_area = image_size * image_size
+    source_area = width * height
+    chosen = (1, 1)
+    smallest_diff = float("inf")
+    for candidate in target_ratios:
+        diff = abs(aspect_ratio - candidate[0] / candidate[1])
+        if diff < smallest_diff:
+            smallest_diff, chosen = diff, candidate
+            continue
+        if diff == smallest_diff:
+            # Same aspect-ratio error: prefer the grid whose area is nearer the source.
+            candidate_gap = abs(source_area - tile_area * candidate[0] * candidate[1])
+            chosen_gap = abs(source_area - tile_area * chosen[0] * chosen[1])
+            if candidate_gap < chosen_gap:
+                chosen = candidate
+    return chosen
+def dynamic_preprocess(
+    image: Image.Image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+):
+    """
+    Resize an image onto a grid of square tiles and cut it into those tiles.
+    The grid shape is the (columns, rows) pair, with min_num <= columns * rows
+    <= max_num, chosen by find_closest_aspect_ratio for the image's aspect ratio.
+    Args:
+        image (PIL.Image.Image): Input image.
+        min_num (int): Minimum number of tiles.
+        max_num (int): Maximum number of tiles.
+        image_size (int): Side length of each square tile.
+        use_thumbnail (bool): Append a whole-image thumbnail when more than one
+            tile was produced.
+    Returns:
+        list[PIL.Image.Image]: Tiles in row-major order, optionally followed by
+            the thumbnail. Empty when the image has a zero-sized dimension.
+    """
+    src_width, src_height = image.size
+    if src_width == 0 or src_height == 0:
+        # No aspect ratio can be computed for a degenerate image.
+        return []
+    # Every (columns, rows) pair whose tile count lies in [min_num, max_num],
+    # ordered by tile count.
+    candidate_grids = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if min_num <= i * j <= max_num
+    )
+    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])
+    grid = find_closest_aspect_ratio(
+        src_width / src_height, candidate_grids, src_width, src_height, image_size
+    )
+    grid_cols, grid_rows = grid[0], grid[1]
+    blocks = grid_cols * grid_rows
+    # Stretch the image so it covers the whole grid exactly.
+    canvas = image.resize(
+        (image_size * grid_cols, image_size * grid_rows),
+        resample=Image.Resampling.BICUBIC,
+    )
+    processed_images = []
+    for tile_idx in range(blocks):
+        row, col = divmod(tile_idx, grid_cols)
+        left, top = col * image_size, row * image_size
+        processed_images.append(
+            canvas.crop((left, top, left + image_size, top + image_size))
+        )
+    assert (
+        len(processed_images) == blocks
+    ), f"Expected {blocks} blocks, but got {len(processed_images)}"
+    if use_thumbnail and blocks > 1:
+        # The thumbnail is made from the original image, not the stretched canvas.
+        processed_images.append(
+            image.resize((image_size, image_size), resample=Image.Resampling.BICUBIC)
+        )
+    return processed_images
+class InternVLImageProcessor(ImageFeatureExtractionMixin):
+    """
+    Image processor that converts PIL images into normalized MLX tile tensors.
+    Each image is optionally split into square tiles by dynamic_preprocess, and
+    every tile is resized and normalized by the function from build_transform.
+    NOTE: `preprocess` only reads `size`, `resample`, `do_dynamic_preprocess`
+    and the `dynamic_*` settings. The remaining constructor arguments are stored
+    on the instance but are not consulted by this class.
+    """
+    model_input_names = ["pixel_values"]
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: int = 448,  # Default image size from dynamic_preprocess
+        resample=Image.Resampling.BICUBIC,
+        do_center_crop: bool = False,  # Not used in original, but standard HF param
+        crop_size=None,
+        do_rescale: bool = True,  # Original code scales by 1/255.0
+        rescale_factor: float = 1 / 255.0,
+        do_normalize: bool = True,
+        image_mean=IMAGENET_MEAN.tolist(),
+        image_std=IMAGENET_STD.tolist(),
+        do_dynamic_preprocess: bool = True,
+        dynamic_min_num: int = 1,
+        dynamic_max_num: int = 12,
+        dynamic_use_thumbnail: bool = True,
+        **kwargs,
+    ):
+        """Store the configuration; extra keyword arguments are accepted and ignored."""
+        super().__init__()
+        self.do_resize = (
+            do_resize  # Although dynamic_preprocess handles resizing internally
+        )
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        # Custom dynamic processing params
+        self.do_dynamic_preprocess = do_dynamic_preprocess
+        self.dynamic_min_num = dynamic_min_num
+        self.dynamic_max_num = dynamic_max_num
+        self.dynamic_use_thumbnail = dynamic_use_thumbnail
+    def preprocess(
+        self,
+        images: List[Image.Image],
+        do_dynamic_preprocess: Optional[bool] = None,
+        size: Optional[int] = None,
+        # ... other params matching __init__ ...
+        return_tensors: Optional[str] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Tile, resize and normalize a batch of PIL images.
+        Args:
+            images: A PIL image or a list of PIL images.
+            do_dynamic_preprocess: Overrides the instance setting when not None.
+            size: Tile side length; overrides the instance setting when not None.
+            return_tensors: Accepted for API compatibility; not used here.
+        Returns:
+            BatchFeature: Holds "pixel_values", built with mx.array from one
+                stacked (tiles, C, H, W) array per input image.
+        Raises:
+            ValueError: If an input is not a PIL image, or an image has a
+                zero-sized dimension and therefore yields no tiles.
+        """
+        do_dynamic_preprocess = (
+            do_dynamic_preprocess
+            if do_dynamic_preprocess is not None
+            else self.do_dynamic_preprocess
+        )
+        size = size if size is not None else self.size
+        if not isinstance(images, list):
+            images = [images]
+        if not all(isinstance(image, Image.Image) for image in images):
+            raise ValueError("Input must be a list of PIL Images.")
+        # The transform depends only on `size`, so build it once for the batch.
+        transform = build_transform(input_size=size)
+        processed_images_batch = []
+        for image in images:
+            if do_dynamic_preprocess:
+                processed_images = dynamic_preprocess(
+                    image,
+                    min_num=self.dynamic_min_num,
+                    max_num=self.dynamic_max_num,
+                    image_size=size,
+                    use_thumbnail=self.dynamic_use_thumbnail,
+                )
+            else:
+                # Simple path: a single resized tile per image.
+                processed_images = [image.resize((size, size), resample=self.resample)]
+            if not processed_images:
+                # dynamic_preprocess returns no tiles for zero-sized images; fail
+                # with a clear message instead of an opaque stacking error.
+                raise ValueError(
+                    "Cannot preprocess an image with a zero-sized dimension."
+                )
+            # One (C, H, W) array per tile, stacked along a new leading axis.
+            pixel_values = mx.stack(
+                [transform(img) for img in processed_images], axis=0
+            )
+            processed_images_batch.append(pixel_values)
+        # NOTE(review): mx.array over a list of arrays presumably requires every
+        # image to have produced the same number of tiles - confirm.
+        data = {"pixel_values": mx.array(processed_images_batch)}
+        return BatchFeature(data=data, tensor_type=None)
+class InternVLChatProcessor(ProcessorMixin):
+    """
+    Combines InternVLImageProcessor and a tokenizer into a single processor.
+    Calling the processor preprocesses the images, expands each "<image>"
+    placeholder in the prompts into the image marker tokens, and tokenizes
+    the resulting prompts.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "InternVLImageProcessor"
+    tokenizer_class = (
+        "AutoTokenizer",
+        "Qwen2TokenizerFast",
+    )  # Specify possible classes
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=chat_template,
+        **kwargs,
+    ):
+        """
+        Args:
+            image_processor: Image processor instance; a default
+                InternVLImageProcessor(**kwargs) is created when None.
+            tokenizer: Tokenizer instance, or a string that is loaded with
+                AutoTokenizer.from_pretrained(..., trust_remote_code=True).
+            chat_template: Jinja chat template forwarded to ProcessorMixin.
+        """
+        if image_processor is None:
+            image_processor = InternVLImageProcessor(**kwargs)
+        if isinstance(tokenizer, str):
+            # SECURITY NOTE: trust_remote_code=True executes code shipped with
+            # the referenced repository; only use trusted sources.
+            tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer, trust_remote_code=True, **kwargs
+            )
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+        # (448 // 14) ** 2 * 0.25 == 256 context tokens per tile; presumably
+        # image size 448, patch size 14 and a 0.5 downsample ratio - confirm.
+        self.num_image_token = int((448 // 14) ** 2 * (0.5**2))
+    def __call__(
+        self,
+        text: Union[str, List[str]] = None,
+        images: List[Image.Image] = None,
+        padding: Union[bool, str] = True,
+        truncation: bool = True,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[str] = "pt",  # Default to PyTorch tensors
+        **kwargs,
+    ):
+        """
+        Preprocess images and/or tokenize prompts.
+        Args:
+            text: One prompt or a list of prompts; may be None.
+            images: One PIL image or a list of PIL images; may be None. When
+                given, prompt i is paired with image i.
+            padding, truncation, max_length, return_tensors: Forwarded to the
+                tokenizer (return_tensors is also forwarded to the image processor).
+            **kwargs: Forwarded to both the image processor and the tokenizer.
+        Returns:
+            dict: "pixel_values" when images were given, plus the tokenizer
+                outputs when text was given.
+        """
+        processed_inputs = {}
+        image_features = None
+        if images is not None:
+            if not isinstance(images, list):
+                # Accept a single image so len(images) below is well defined.
+                images = [images]
+            image_features = self.image_processor.preprocess(
+                images, return_tensors=return_tensors, **kwargs
+            )
+            processed_inputs.update(image_features)  # Should contain 'pixel_values'
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            if image_features is None:
+                # Text-only call: nothing to expand, tokenize prompts as given.
+                queries = list(text)
+            else:
+                if len(text) > len(images):
+                    logger.warning(
+                        "Received more prompts than images; prompts without a "
+                        "matching image are not tokenized."
+                    )
+                queries = []
+                for idx in range(len(images)):
+                    question = text[idx]
+                    if "<image>" not in question:
+                        question = "<image>\n" + question
+                    # Leading axis of the stacked array is the tile count.
+                    num_patches = image_features["pixel_values"][idx].shape[0]
+                    image_tokens = (
+                        IMG_START_TOKEN
+                        + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
+                        + IMG_END_TOKEN
+                    )
+                    # Only the first placeholder is expanded.
+                    question = question.replace("<image>", image_tokens, 1)
+                    queries.append(question)
+            self.tokenizer.padding_side = "left"
+            text_inputs = self.tokenizer(
+                queries,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
+            processed_inputs.update(text_inputs)  # 'input_ids', 'attention_mask'
+        return processed_inputs
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the tokenizer's batch_decode method.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the tokenizer's decode method.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    def save_pretrained(self, save_directory, **kwargs):
+        """Intentionally a no-op: nothing is written to save_directory."""
+        # TODO: persist both the tokenizer and the image processor configuration.
+        pass
+    @staticmethod
+    def from_pretrained(pretrained_model_name_or_path, **kwargs):
+        """
+        Load the tokenizer from pretrained_model_name_or_path and pair it with a
+        freshly constructed InternVLImageProcessor. The same kwargs are passed to both.
+        """
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, **kwargs
+        )
+        image_processor = InternVLImageProcessor(**kwargs)
+        return InternVLChatProcessor(
+            image_processor=image_processor, tokenizer=tokenizer
+        )
+    # TODO: from_pretrained builds the image processor from kwargs only; once
+    # InternVLImageProcessor can be saved and loaded, save_pretrained should
+    # write both components and from_pretrained should load both.
+# Registration
+# Importing this module has a side effect: the custom classes are registered
+# with the transformers Auto* factories under MODEL_TYPE.
+MODEL_TYPE = "internvl_chat"  # Verify this from the model's config.json
+# NOTE(review): a plain string is passed as the registration key here;
+# confirm the installed transformers version accepts that.
+AutoImageProcessor.register(
+    MODEL_TYPE, slow_image_processor_class=InternVLImageProcessor
+)
+AutoProcessor.register(MODEL_TYPE, InternVLChatProcessor)
+logger.info(f"Registered custom processor classes for model type '{MODEL_TYPE}'.")

nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py ADDED Viewed

@@ -0,0 +1,293 @@
+import inspect
+from dataclasses import dataclass
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+from ..base import interpolate
+@dataclass
+class VisionConfig:
+    """Hyper-parameters of the vision encoder; only model_type is required."""
+    model_type: str
+    hidden_size: int = 1024
+    num_attention_heads: int = 16
+    patch_size: int = 14
+    num_hidden_layers: int = 24
+    intermediate_size: int = 4096
+    image_size: int = 448
+    num_channels: int = 3
+    layer_norm_eps: float = 1e-6
+    drop_path_rate: float = 0.1
+    qkv_bias: bool = True
+    qk_normalization: bool = False
+    norm_type: str = "layer_norm"
+    @classmethod
+    def from_dict(cls, params):
+        """Build a config from a mapping, dropping keys that are not constructor parameters."""
+        accepted = inspect.signature(cls).parameters
+        known = {}
+        for key, value in params.items():
+            if key in accepted:
+                known[key] = value
+        return cls(**known)
+def check_array_shape(arr):
+    """
+    Heuristically decide whether a conv weight is already laid out as
+    (out_channels, kH, kW, in_channels): it must be 4-D, have equal second and
+    third dimensions, and a first dimension at least as large as both.
+    """
+    dims = arr.shape
+    if len(dims) != 4:
+        return False
+    out_channels, kernel_h, kernel_w, _ = dims
+    if kernel_h != kernel_w:
+        return False
+    return (out_channels >= kernel_h) and (out_channels >= kernel_w)
+class Attention(nn.Module):
+    """Multi-head self-attention with a fused QKV projection and optional RMS-normalized queries/keys."""
+    def __init__(self, config: VisionConfig):
+        super().__init__()
+        if (config.hidden_size % config.num_attention_heads) != 0:
+            raise ValueError(
+                "The input feature dimensions should be divisible by the "
+                f"number of heads ({config.hidden_size} % {config.num_attention_heads}) != 0"
+            )
+        self.dims = dims = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        head_dim = config.hidden_size // config.num_attention_heads
+        # Attention logits are scaled by 1 / sqrt(head_dim).
+        self.scale = head_dim**-0.5
+        self.qkv_bias = config.qkv_bias
+        # A single linear layer produces queries, keys and values together.
+        self.qkv = nn.Linear(dims, 3 * dims, bias=config.qkv_bias)
+        self.proj = nn.Linear(dims, dims)
+        self.qk_normalization = config.qk_normalization
+        if self.qk_normalization:
+            # The norms span the full hidden size, i.e. all heads at once.
+            self.q_norm = nn.RMSNorm(dims, eps=config.layer_norm_eps)
+            self.k_norm = nn.RMSNorm(dims, eps=config.layer_norm_eps)
+    def __call__(self, x, mask=None):
+        """Apply self-attention to x of shape (B, L, C) and return an array of shape (B, L, C)."""
+        B, L, C = x.shape
+        qkv = self.qkv(x).reshape(B, L, 3, self.num_heads, C // self.num_heads)
+        # (B, L, 3, heads, head_dim) -> (3, B, heads, L, head_dim)
+        qkv = qkv.transpose(2, 0, 3, 1, 4)
+        queries, keys, values = (
+            qkv[0],
+            qkv[1],
+            qkv[2],
+        )  # Each has shape (B, num_heads, L, C // num_heads)
+        if self.qk_normalization:
+            B_, H_, N_, D_ = queries.shape
+            # Merge the heads back into one axis so the norm sees the full
+            # hidden size, then restore the per-head layout.
+            queries = (
+                self.q_norm(queries.transpose(0, 2, 1, 3).flatten(-2, -1))
+                .reshape(B_, N_, H_, D_)
+                .transpose(0, 2, 1, 3)
+            )
+            keys = (
+                self.k_norm(keys.transpose(0, 2, 1, 3).flatten(-2, -1))
+                .reshape(B_, N_, H_, D_)
+                .transpose(0, 2, 1, 3)
+            )
+        output = mx.fast.scaled_dot_product_attention(
+            queries, keys, values, scale=self.scale, mask=mask
+        )
+        # (B, heads, L, head_dim) -> (B, L, C)
+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.proj(output)
+class MLP(nn.Module):
+    """Two-layer feed-forward block: hidden_size -> intermediate_size -> hidden_size with GELU in between."""
+    def __init__(self, config: VisionConfig):
+        super().__init__()
+        hidden, inner = config.hidden_size, config.intermediate_size
+        self.activation_fn = nn.GELU(approx="precise")
+        self.fc1 = nn.Linear(hidden, inner)
+        self.fc2 = nn.Linear(inner, hidden)
+    def __call__(self, x: mx.array) -> mx.array:
+        """Expand, activate, then project back to the input width."""
+        return self.fc2(self.activation_fn(self.fc1(x)))
+class EncoderLayer(nn.Module):
+    """
+    Pre-norm transformer block: attention and MLP sub-layers, each with its own
+    normalization, a learnable per-channel scale (ls1 / ls2) and a residual
+    connection. norm_type selects LayerNorm ("layer_norm") or RMSNorm ("rms_norm").
+    """
+    def __init__(self, config: VisionConfig, drop_path_rate: float = 0.0):
+        """
+        Args:
+            config: Vision encoder configuration.
+            drop_path_rate: When > 0, nn.Dropout with this rate is applied to each
+                sub-layer output before the residual add; otherwise nn.Identity.
+        Raises:
+            ValueError: If config.norm_type is not "layer_norm" or "rms_norm".
+        """
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.norm_type = getattr(config, "norm_type", "layer_norm")
+        self.attn = Attention(config)
+        self.mlp = MLP(config)
+        if self.norm_type == "layer_norm":
+            self.norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+            self.norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        elif self.norm_type == "rms_norm":
+            self.norm1 = nn.RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
+            self.norm2 = nn.RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
+        else:
+            raise ValueError(f"Unsupported normalization type: {self.norm_type}")
+        # Per-channel scales, initialised to one.
+        self.ls1 = mx.ones((self.embed_dim,))
+        self.ls2 = mx.ones((self.embed_dim,))
+        self.drop_path1 = (
+            nn.Dropout(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
+        self.drop_path2 = (
+            nn.Dropout(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
+    def __call__(self, x: mx.array, mask: Optional[mx.array] = None) -> mx.array:
+        """
+        Run the block on x and return an array with the same dtype as x.
+        Args:
+            x: Input hidden states.
+            mask: Optional attention mask, forwarded to the attention sub-layer.
+        """
+        dtype = x.dtype
+        # Norm outputs are cast back so the sub-layers run in the input dtype.
+        # The mask was previously accepted but never reached the attention module.
+        x = x + self.drop_path1(
+            self.attn(self.norm1(x).astype(dtype), mask=mask) * self.ls1
+        )
+        x = x + self.drop_path2(self.mlp(self.norm2(x).astype(dtype)) * self.ls2)
+        return x.astype(dtype)
+class Encoder(nn.Module):
+    """Stack of EncoderLayer blocks whose drop-path rate grows linearly with depth."""
+    def __init__(self, config: VisionConfig):
+        super().__init__()
+        # Linearly spaced rates from 0 to config.drop_path_rate, one per layer.
+        # They are passed as plain Python floats, matching EncoderLayer's
+        # `drop_path_rate: float` parameter, rather than as 0-d mx arrays.
+        drop_path_rates = np.linspace(
+            0, config.drop_path_rate, config.num_hidden_layers
+        ).tolist()
+        self.layers = [EncoderLayer(config, rate) for rate in drop_path_rates]
+    def __call__(
+        self,
+        x: mx.array,
+        output_hidden_states: Optional[bool] = None,
+        mask: Optional[mx.array] = None,
+    ) -> mx.array:
+        """
+        Run every layer in order.
+        Returns:
+            tuple: (last_hidden_state, encoder_states). encoder_states is a tuple
+                of the input followed by each layer's output when
+                output_hidden_states is truthy, otherwise None.
+        """
+        encoder_states = (x,) if output_hidden_states else None
+        for layer in self.layers:
+            x = layer(x, mask=mask)
+            if output_hidden_states:
+                encoder_states = encoder_states + (x,)
+        return (x, encoder_states)
+class VisionEmbeddings(nn.Module):
+    """
+    Patch, class-token and position embeddings for the vision encoder.
+    The stored position embedding covers an (image_size // patch_size) ** 2 grid
+    plus one class position; the grid part is interpolated to the patch grid of
+    the actual input.
+    """
+    def __init__(self, config: VisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.class_embedding = mx.random.normal((1, 1, self.embed_dim))
+        # Non-overlapping patches: kernel size and stride both equal patch_size.
+        # The channel count comes from the config (default 3) instead of a literal.
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = mx.random.normal(
+            (1, self.num_positions, self.embed_dim)
+        )
+    def _get_pos_embed(self, pos_embed, H, W):
+        """Resize the grid position embedding (1, num_patches, C) to (1, H * W, C)."""
+        target_dtype = pos_embed.dtype
+        grid = self.image_size // self.patch_size
+        # (1, num_patches, C) -> (1, C, grid, grid) for interpolation.
+        pos_embed = pos_embed.reshape(1, grid, grid, -1).transpose(0, 3, 1, 2)
+        pos_embed = interpolate(pos_embed, (H, W))
+        # (1, C, H, W) -> (1, H * W, C)
+        pos_embed = (
+            pos_embed.reshape(1, -1, H * W).transpose(0, 2, 1).astype(target_dtype)
+        )
+        return pos_embed
+    def __call__(self, x: mx.array) -> mx.array:
+        """
+        Embed a batch of images into a (batch, 1 + patches, embed_dim) sequence.
+        Assumes x is channels-last (batch, height, width, channels) - TODO confirm
+        against the caller.
+        """
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(x).transpose(
+            0, 3, 1, 2
+        )  # shape = [*, channel, height, width]
+        batch_size, _, height, width = patch_embeds.shape
+        # (B, C, H, W) -> (B, H * W, C)
+        patch_embeds = mx.flatten(patch_embeds, start_axis=2).transpose(0, 2, 1)
+        class_embeds = mx.broadcast_to(
+            self.class_embedding, (batch_size, 1, self.embed_dim)
+        ).astype(target_dtype)
+        embeddings = mx.concatenate([class_embeds, patch_embeds], axis=1)
+        # The class position is kept as-is; only the grid part is interpolated.
+        position_embedding = mx.concatenate(
+            [
+                self.position_embedding[:, :1, :],
+                self._get_pos_embed(self.position_embedding[:, 1:, :], height, width),
+            ],
+            axis=1,
+        )
+        embeddings = embeddings + position_embedding.astype(target_dtype)
+        return embeddings
+class VisionModel(nn.Module):
+    """Vision tower: VisionEmbeddings followed by the transformer Encoder."""
+    def __init__(self, config: VisionConfig):
+        """
+        Raises:
+            ValueError: If config.model_type is not one of the supported types.
+        """
+        super().__init__()
+        self.model_type = config.model_type
+        if self.model_type not in ["siglip_vision_model", "intern_vit_6b"]:
+            raise ValueError(f"Unsupported model type: {self.model_type}")
+        self.embeddings = VisionEmbeddings(config)
+        self.encoder = Encoder(config)
+    def __call__(
+        self,
+        x: mx.array,
+        output_hidden_states: Optional[bool] = None,
+    ) -> tuple:
+        """
+        Encode a batch of images.
+        Returns:
+            tuple: (last_hidden_state, pooler_output, hidden_states), where
+                pooler_output is the class-token state and hidden_states is the
+                tuple of per-layer outputs (the embedding output is dropped), or
+                None when output_hidden_states is falsy.
+        """
+        x = self.embeddings(x)
+        last_hidden_state, encoder_outputs = self.encoder(
+            x=x, output_hidden_states=output_hidden_states, mask=None
+        )
+        pooler_output = last_hidden_state[:, 0, :]
+        # The encoder returns None when hidden states were not requested;
+        # slicing it unconditionally raised a TypeError for the default call.
+        hidden_states = encoder_outputs[1:] if encoder_outputs is not None else None
+        return last_hidden_state, pooler_output, hidden_states
+    def sanitize(self, weights):
+        """
+        Adapt a checkpoint weight dict for this model: drop "position_ids"
+        entries and make sure the patch-embedding conv weight is channels-last.
+        """
+        sanitized_weights = {}
+        for k, v in weights.items():
+            if "position_ids" in k:
+                # Remove unused position_ids
+                continue
+            elif "patch_embedding.weight" in k:
+                # PyTorch conv2d weight tensors have shape:
+                #   [out_channels, in_channels, kH, KW]
+                # MLX conv2d expects the weight be of shape:
+                #   [out_channels, kH, KW, in_channels]
+                if check_array_shape(v):
+                    # Already looks channels-last; keep as-is.
+                    sanitized_weights[k] = v
+                else:
+                    sanitized_weights[k] = v.transpose(0, 2, 3, 1)
+            else:
+                sanitized_weights[k] = v
+        return sanitized_weights