fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/models/ernie4_5_moe_vl/vision.py
@@ -0,0 +1,322 @@
+"""DFNRope Vision Transformer for ERNIE 4.5 VL."""
+
+from typing import Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+
+from .config import VisionConfig
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return mx.concatenate([-x2, x1], axis=-1)
+
+
+def apply_rotary_pos_emb_vision(tensor: mx.array, freqs: mx.array) -> mx.array:
+    """Applies Rotary Position Embedding to the input tensors.
+
+    Args:
+        tensor: The input tensor.
+        freqs: The frequencies used for the rotation.
+
+    Returns:
+        output: the tensor rotated using the Rotary Position Embedding.
+    """
+    orig_dtype = tensor.dtype
+    tensor = tensor.astype(mx.float32)
+    cos = mx.cos(freqs)
+    sin = mx.sin(freqs)
+    # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
+    cos = mx.expand_dims(cos, axis=1)
+    cos = mx.tile(cos, (1, 1, 2))
+    cos = mx.expand_dims(cos, axis=0)
+
+    sin = mx.expand_dims(sin, axis=1)
+    sin = mx.tile(sin, (1, 1, 2))
+    sin = mx.expand_dims(sin, axis=0)
+
+    output = tensor * cos + rotate_half(tensor) * sin
+    return output.astype(orig_dtype)
+
+
+class VisionRotaryEmbedding(nn.Module):
+    """Rotary position embedding for vision transformer."""
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+
+    def __call__(self, seqlen: int) -> mx.array:
+        inv_freq = 1.0 / (
+            self.theta ** (mx.arange(0, self.dim, 2, dtype=mx.float32) / self.dim)
+        )
+        if isinstance(seqlen, mx.array):
+            seqlen = seqlen.item()
+        seq = mx.arange(seqlen, dtype=inv_freq.dtype)
+        freqs = mx.outer(seq, inv_freq)
+        return freqs
+
+
+class PatchEmbed(nn.Module):
+    """Linear patch embedding for DFNRope Vision Transformer."""
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        in_channels: int = 3,
+        embed_dim: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.embed_dim = embed_dim
+        # Linear projection: in_channels * patch_size * patch_size -> embed_dim
+        self.proj = nn.Linear(
+            in_channels * patch_size * patch_size, embed_dim, bias=False
+        )
+
+    def __call__(self, hidden_states: mx.array) -> mx.array:
+        """
+        Args:
+            hidden_states: Input tensor of shape [num_patches, in_channels * patch_size * patch_size]
+        Returns:
+            Output tensor of shape [num_patches, embed_dim]
+        """
+        target_dtype = self.proj.weight.dtype
+        hidden_states = self.proj(hidden_states.astype(target_dtype))
+        return hidden_states
+
+
+class VisionMLP(nn.Module):
+    """MLP for vision transformer block."""
+
+    def __init__(
+        self, dim: int, hidden_dim: int, hidden_act: str = "quick_gelu"
+    ) -> None:
+        super().__init__()
+        self.fc1 = nn.Linear(dim, hidden_dim)
+        self.fc2 = nn.Linear(hidden_dim, dim)
+        self.hidden_act = hidden_act
+
+    def __call__(self, x: mx.array) -> mx.array:
+        x = self.fc1(x)
+        if self.hidden_act == "quick_gelu":
+            x = x * mx.sigmoid(1.702 * x)
+        elif self.hidden_act == "gelu":
+            x = nn.gelu(x)
+        elif self.hidden_act == "silu":
+            x = nn.silu(x)
+        else:
+            x = nn.gelu(x)
+        return self.fc2(x)
+
+
+class VisionAttention(nn.Module):
+    """Multi-head attention for vision transformer."""
+
+    def __init__(self, dim: int, num_heads: int = 16) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)
+        self.proj = nn.Linear(dim, dim)
+
+    def __call__(
+        self,
+        x: mx.array,
+        cu_seqlens: mx.array,
+        rotary_pos_emb: Optional[mx.array] = None,
+    ) -> mx.array:
+        """Forward function for vision attention."""
+        seq_length = x.shape[0]
+        qkv = (
+            self.qkv(x).reshape(seq_length, 3, self.num_heads, -1).transpose(1, 0, 2, 3)
+        )
+        q, k, v = mx.split(qkv, 3)
+
+        q = apply_rotary_pos_emb_vision(mx.expand_dims(q, 0), rotary_pos_emb)[0]
+        k = apply_rotary_pos_emb_vision(mx.expand_dims(k, 0), rotary_pos_emb)[0]
+
+        q = q.transpose(0, 2, 1, 3)
+        k = k.transpose(0, 2, 1, 3)
+        v = v.transpose(0, 2, 1, 3)
+
+        lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        splits = [
+            mx.split(tensor, [lengths[0], sum(lengths[:2])], axis=2)
+            for tensor in (q, k, v)
+        ]
+
+        attn_outputs = []
+        for q, k, v in zip(*splits):
+            output = mx.fast.scaled_dot_product_attention(
+                q, k, v, scale=self.scale, mask=None
+            )
+            attn_outputs.append(output)
+
+        output = mx.concatenate(attn_outputs, axis=2)
+        output = output.transpose(0, 2, 1, 3).reshape(seq_length, -1)
+        return self.proj(output)
+
+
+class DFNRopeVisionBlock(nn.Module):
+    """DFNRope Vision Transformer block."""
+
+    def __init__(self, config: VisionConfig) -> None:
+        super().__init__()
+        self.norm1 = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
+        self.norm2 = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
+
+        mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
+        self.attn = VisionAttention(config.embed_dim, num_heads=config.num_heads)
+        self.mlp = VisionMLP(
+            dim=config.embed_dim,
+            hidden_dim=mlp_hidden_dim,
+            hidden_act=config.hidden_act,
+        )
+
+    def __call__(
+        self, hidden_states: mx.array, cu_seqlens: mx.array, rotary_pos_emb: mx.array
+    ) -> mx.array:
+        hidden_states = hidden_states + self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+        )
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+
+class VisionModel(nn.Module):
+    """DFNRope Vision Transformer for ERNIE 4.5 VL."""
+
+    def __init__(self, config: VisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.model_type = config.model_type
+        self.spatial_merge_size = config.spatial_merge_size
+
+        self.patch_embed = PatchEmbed(
+            patch_size=config.patch_size,
+            in_channels=config.in_channels,
+            embed_dim=config.embed_dim,
+        )
+
+        head_dim = config.embed_dim // config.num_heads
+        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = [DFNRopeVisionBlock(config) for _ in range(config.depth)]
+        self.ln = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
+
+    def rot_pos_emb(self, grid_thw: mx.array, num_pad: int = 0) -> mx.array:
+        """Compute rotary position embedding for vision.
+
+        Args:
+            grid_thw: Grid dimensions [batch, 3] containing (t, h, w)
+            num_pad: Number of padding tokens
+
+        Returns:
+            Rotary position embedding tensor
+        """
+        pos_ids = []
+        grid_hw_array = np.array(grid_thw.tolist(), dtype=np.int64)
+
+        for t, h, w in grid_hw_array:
+            hpos_ids = np.arange(h).reshape(-1, 1)
+            hpos_ids = np.tile(hpos_ids, (1, w))
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = np.transpose(hpos_ids, (0, 2, 1, 3))
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = np.arange(w).reshape(1, -1)
+            wpos_ids = np.tile(wpos_ids, (h, 1))
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = np.transpose(wpos_ids, (0, 2, 1, 3))
+            wpos_ids = wpos_ids.flatten()
+
+            stacked_ids = np.stack([hpos_ids, wpos_ids], axis=-1)
+            tiled_ids = np.tile(stacked_ids, (t, 1))
+            pos_ids.append(tiled_ids)
+
+        pos_ids = np.concatenate(pos_ids, axis=0)
+        if num_pad > 0:
+            pos_ids = np.concatenate(
+                [pos_ids, np.zeros((num_pad, 2), dtype=pos_ids.dtype)], axis=0
+            )
+
+        max_grid_size = int(np.max(grid_hw_array[:, 1:]))
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        pos_ids_mx = mx.array(pos_ids, dtype=mx.int32)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids_mx].reshape(pos_ids.shape[0], -1)
+
+        return rotary_pos_emb
+
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        grid_thw: mx.array,
+        output_hidden_states: Optional[bool] = None,
+        num_pad: int = 0,
+    ) -> mx.array:
+        """Forward pass through the vision model.
+
+        Args:
+            hidden_states: Input pixel values [num_patches, channels * patch_h * patch_w]
+            grid_thw: Grid dimensions [batch, 3]
+            output_hidden_states: Whether to output hidden states
+            num_pad: Number of padding tokens
+
+        Returns:
+            Vision features
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw, num_pad=num_pad)
+
+        # Compute cumulative sequence lengths
+        cu_seqlens = mx.zeros(1, dtype=mx.int32)
+        for i in range(grid_thw.shape[0]):
+            t, h, w = grid_thw[i].tolist()
+            seq_len = t * h * w
+            cu_seqlens = mx.concatenate([cu_seqlens, cu_seqlens[-1:] + seq_len])
+
+        if num_pad > 0:
+            cu_seqlens = mx.concatenate([cu_seqlens, cu_seqlens[-1:] + num_pad])
+
+        encoder_states = (hidden_states,) if output_hidden_states else None
+
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb=rotary_pos_emb,
+            )
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+        hidden_states = self.ln(hidden_states)
+        return hidden_states
+
+    def sanitize(self, weights):
+        """Sanitize weights for loading."""
+        sanitized_weights = {}
+        for k, v in weights.items():
+            if "position_ids" in k:
+                continue
+            sanitized_weights[k] = v
+        return sanitized_weights
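The rotary embedding in this vision stack duplicates each frequency across adjacent channel pairs (`mx.tile(cos, (1, 1, 2))`) to match the `rotate_half` convention, rather than interleaving sin/cos per channel. The following is a minimal sketch of how the pieces above compose; the shapes are invented for illustration and are not taken from the package's configs:

```python
import mlx.core as mx

from mlx_vlm.models.ernie4_5_moe_vl.vision import (
    VisionRotaryEmbedding,
    apply_rotary_pos_emb_vision,
)

# Toy dimensions chosen only for illustration.
seq_len, num_heads, head_dim = 4, 2, 8

rope = VisionRotaryEmbedding(head_dim // 2)  # one frequency per channel pair
freqs = rope(seq_len)                        # shape [seq_len, head_dim // 2]

q = mx.random.normal((seq_len, num_heads, head_dim))
q_rot = apply_rotary_pos_emb_vision(mx.expand_dims(q, 0), freqs)[0]

# Position 0 has angle 0 (cos=1, sin=0), so its vectors pass through unchanged.
print(q_rot.shape)                  # (4, 2, 8)
print(mx.allclose(q_rot[0], q[0]))  # True
```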
mlx_vlm/models/fastvlm/__init__.py
@@ -0,0 +1,2 @@
+from .config import ModelConfig, TextConfig, VisionConfig
+from .fastvlm import LanguageModel, Model, VisionModel
mlx_vlm/models/fastvlm/config.py
@@ -0,0 +1,79 @@
+import inspect
+from dataclasses import dataclass
+from typing import Dict, Optional, Union
+
+from ..base import BaseModelConfig
+
+
+@dataclass
+class TextConfig(BaseModelConfig):
+    model_type: str
+    hidden_size: int = 896
+    num_hidden_layers: int = 24
+    intermediate_size: int = 4864
+    num_attention_heads: int = 14
+    rms_norm_eps: float = 1e-06
+    vocab_size: int = 151936
+    num_key_value_heads: int = 2
+    max_position_embeddings: int = 32768
+    rope_theta: float = 1000000
+    rope_traditional: bool = False
+    rope_scaling: Optional[Dict[str, Union[float, str]]] = None
+    tie_word_embeddings: bool = True
+
+
+@dataclass
+class VisionConfig(BaseModelConfig):
+    model_type: str = "llava_qwen2"  # fastvlm?
+    hidden_size: int = 1024
+    intermediate_size: int = 3072
+    image_size: int = 1024
+    patch_size: int = 64
+    projection_dim: int = 768
+    num_classes = 1000
+    down_patch_size = 7
+    down_stride = 2
+    layer_scale_init_value = 1e-5
+    cls_ratio = 2.0
+    # FastViTHD variant
+    layers = [2, 12, 24, 4, 2]
+    embed_dims = [96, 192, 384, 768, 1536]
+    mlp_ratios = [4, 4, 4, 4, 4]
+    downsamples = [True, True, True, True, True]
+    pos_embs_shapes = [None, None, None, (7, 7), (7, 7)]
+    token_mixers = ("repmixer", "repmixer", "repmixer", "attention", "attention")
+    repmixer_kernel_size = 3
+
+
+@dataclass
+class ModelConfig(BaseModelConfig):
+    text_config: TextConfig
+    vision_config: VisionConfig
+    model_type: str = "llava_qwen2"  # fastvlm?
+    ignore_index: int = -100
+    image_token_index: int = -200
+    eos_token_id: int = 151645
+    mm_projector_type: str = "mlp2x_gelu"
+    mm_hidden_size: int = 3072
+    tokenizer_model_max_length: int = 8192
+    tokenizer_padding_side: str = "right"
+
+    @classmethod
+    def from_dict(cls, params):
+        if not params.get("text_config", {}):
+            # Copy text config parameters from root level
+            excluded_keys = {"vision_config"}
+            params["text_config"] = dict(
+                filter(lambda x: x[0] not in excluded_keys, params.items())
+            )
+
+        if not params.get("vision_config", {}):
+            params["vision_config"] = {}
+
+        return cls(
+            **{
+                k: v
+                for k, v in params.items()
+                if k in inspect.signature(cls).parameters
+            }
+        )
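For reference, the `from_dict` hook above lets FastVLM checkpoints keep their text-model fields at the top level of `config.json`: everything except `vision_config` is mirrored into `text_config` before filtering on the dataclass signature. A short usage sketch with an invented, abbreviated config dict (not a real checkpoint config):

```python
from mlx_vlm.models.fastvlm.config import ModelConfig

# Hypothetical flat config with no explicit "text_config" section.
params = {
    "model_type": "llava_qwen2",
    "hidden_size": 896,
    "num_hidden_layers": 24,
    "mm_hidden_size": 3072,
    "vision_config": {},
}

cfg = ModelConfig.from_dict(params)
print(cfg.text_config["hidden_size"])  # 896 (root-level fields were copied over)
print(cfg.mm_hidden_size)              # 3072
```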
mlx_vlm/models/fastvlm/fastvlm.py
@@ -0,0 +1,198 @@
+import re
+from typing import Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+
+from ..base import InputEmbeddingsFeatures
+from .config import ModelConfig
+from .language import LanguageModel
+from .vision import CallableModuleList, VisionModel
+
+
+def build_vision_projector(config):
+    hidden_size = config.text_config.hidden_size
+    projector_type = getattr(config, "mm_projector_type", "mlp2x_gelu")
+    if projector_type == "linear":
+        return nn.Linear(config.mm_hidden_size, hidden_size)
+
+    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
+    if mlp_gelu_match:
+        mlp_depth = int(mlp_gelu_match.group(1))
+        modules = CallableModuleList()
+        modules.append(nn.Linear(config.mm_hidden_size, hidden_size))
+        for _ in range(1, mlp_depth):
+            modules.append(nn.GELU())
+            modules.append(nn.Linear(hidden_size, hidden_size))
+        return modules
+    raise ValueError(f"Unknown projector type: {projector_type}")
+
+
+class Model(nn.Module):
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.config = config
+        self.vision_tower = VisionModel(config.vision_config)
+        self.language_model = LanguageModel(config.text_config)
+        self.mm_projector = build_vision_projector(config)
+
+    def get_input_embeddings(
+        self,
+        input_ids: Optional[mx.array] = None,
+        pixel_values: Optional[mx.array] = None,
+        mask: Optional[mx.array] = None,
+        **kwargs,
+    ):
+        if pixel_values is None:
+            return InputEmbeddingsFeatures(
+                inputs_embeds=self.language_model.model.embed_tokens(input_ids)
+            )
+
+        _, image_features, _ = self.vision_tower(pixel_values.transpose(0, 2, 3, 1))
+        B, H, W, C = image_features.shape
+        image_features = image_features.reshape(B, H * W, C)
+        image_features = self.mm_projector(image_features)
+
+        final_inputs_embeds = self.prepare_inputs_for_multimodal(
+            image_features, input_ids, mask
+        )
+        return InputEmbeddingsFeatures(inputs_embeds=final_inputs_embeds)
+
+    # Source: https://github.com/apple/ml-fastvlm/blob/592b4add3c1c8a518e77d95dc6248e76c1dd591f/llava/model/llava_arch.py#L146
+    def prepare_inputs_for_multimodal(self, image_features, input_ids, mask):
+        if mask is not None:
+            input_ids = [
+                cur_input_ids[
+                    (start := mx.argmax(cur_mask).item()) : start
+                    + cur_mask.sum().item()
+                ]
+                for cur_input_ids, cur_mask in zip(input_ids, mask)
+            ]
+
+        new_input_embeds = []
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == self.config.image_token_index).sum()
+            if num_images == 0:
+                cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.language_model.model.embed_tokens(
+                    cur_input_ids
+                )
+                cur_input_embeds = mx.concatenate(
+                    [cur_input_embeds_1, cur_image_features[0:0]], axis=0
+                )
+                new_input_embeds.append(cur_input_embeds)
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = (
+                [-1]
+                + np.where(np.array(cur_input_ids == self.config.image_token_index))[
+                    0
+                ].tolist()
+                + [cur_input_ids.shape[0]]
+            )
+            cur_input_ids_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(
+                    cur_input_ids[
+                        image_token_indices[i] + 1 : image_token_indices[i + 1]
+                    ]
+                )
+            split_sizes = image_token_indices[1:]
+            cur_input_embeds = self.language_model.model.embed_tokens(
+                mx.concatenate(cur_input_ids_noim)
+            )
+            cur_input_embeds_no_im = mx.split(cur_input_embeds, split_sizes)
+
+            cur_new_input_embeds = []
+            for i in range(num_images.item() + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                if i < num_images:
+                    cur_image_features = image_features[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+            cur_new_input_embeds = mx.concatenate(cur_new_input_embeds)
+
+            new_input_embeds.append(cur_new_input_embeds)
+
+        if self.config.tokenizer_model_max_length is not None:
+            new_input_embeds = [
+                x[: self.config.tokenizer_model_max_length] for x in new_input_embeds
+            ]
+
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        new_input_embeds_padded = []
+        for i, cur_new_embed in enumerate(new_input_embeds):
+            cur_len = cur_new_embed.shape[0]
+            padded = cur_new_embed
+            if max_len > cur_len:
+                if self.config.tokenizer_padding_side == "left":
+                    padded = mx.concatenate(
+                        (
+                            mx.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                            ),
+                            cur_new_embed,
+                        ),
+                        axis=0,
+                    )
+                else:
+                    padded = mx.concatenate(
+                        (
+                            cur_new_embed,
+                            mx.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                            ),
+                        ),
+                        axis=0,
+                    )
+            new_input_embeds_padded.append(padded)
+        new_input_embeds = mx.stack(new_input_embeds_padded)
+        return new_input_embeds
+
+    @property
+    def layers(self):
+        return self.language_model.model.layers
+
+    def __call__(
+        self,
+        input_ids: mx.array,
+        pixel_values: mx.array,
+        mask: mx.array,
+        cache=None,
+        **kwargs,
+    ):
+        input_embeddings_features = self.get_input_embeddings(
+            input_ids, pixel_values, mask
+        )
+        logits = self.language_model(
+            input_ids,
+            mask=mask,
+            cache=cache,
+            inputs_embeds=input_embeddings_features.inputs_embeds,
+        )
+        return logits
+
+    def sanitize(self, weights):
+        def transform_key(key):
+            if "vision_tower" in key:
+                if "model.vision_tower" in key:
+                    key = key.replace(
+                        "model.vision_tower.vision_tower.model",
+                        "vision_tower.vision_model",
+                    )
+                    key = key.replace("patch_embed", "patch_embed.blocks")
+                return key
+            if "lm_head" in key:
+                return key
+            if "mm_projector" in key:
+                return key.replace("model.", "")
+            if "language_model" not in key:
+                return "language_model." + key
+            return key
+
+        return {transform_key(k): v for k, v in weights.items()}
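`prepare_inputs_for_multimodal` follows the upstream LLaVA recipe linked in the comment: occurrences of `image_token_index` (-200) are cut out of the prompt, the surrounding text spans are embedded, and the projected image features are spliced into the gaps before padding to a common length. Below is a toy sketch of just the splitting step, using invented token ids and plain NumPy rather than the model itself:

```python
import numpy as np

IMAGE_TOKEN_INDEX = -200  # same placeholder id as ModelConfig.image_token_index

# Hypothetical prompt: text tokens with a single image placeholder in the middle.
input_ids = np.array([101, 2023, IMAGE_TOKEN_INDEX, 3004, 102])

# Same bookkeeping as above: sentinel indices at both ends, then slice between them.
image_token_indices = (
    [-1] + np.where(input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [len(input_ids)]
)
text_spans = [
    input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]]
    for i in range(len(image_token_indices) - 1)
]
print(text_spans)  # two text spans: [101, 2023] and [3004, 102]
# One image's projected features are inserted into each gap between consecutive spans.
```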
mlx_vlm/models/fastvlm/language.py
@@ -0,0 +1,49 @@
+from typing import Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.qwen2 import Qwen2Model
+
+from ..base import LanguageModelOutput
+from .config import TextConfig
+
+
+class LanguageModel(nn.Module):
+    def __init__(self, config: TextConfig):
+        super().__init__()
+        self.config = config
+        self.model_type = config.model_type
+        self.model = Qwen2Model(config)
+        if not config.tie_word_embeddings:
+            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    # TODO: mask is going away in mlx-lm, see https://github.com/ml-explore/mlx-lm/pull/430
+    def __call__(
+        self,
+        inputs: mx.array,
+        mask: mx.array = None,
+        cache=None,
+        inputs_embeds: Optional[mx.array] = None,
+    ):
+        out = self.model(inputs, cache=cache, input_embeddings=inputs_embeds)
+        out = self.model.embed_tokens.as_linear(out)
+        return LanguageModelOutput(out)
+
+    def sanitize(self, weights):
+        if self.config.tie_word_embeddings:
+            weights.pop("lm_head.weight", None)
+        return {
+            k: v for k, v in weights.items() if "self_attn.rotary_emb.inv_freq" not in k
+        }
+
+    @property
+    def layers(self):
+        return self.model.layers
+
+    @property
+    def head_dim(self):
+        return self.config.hidden_size // self.config.num_attention_heads
+
+    @property
+    def n_kv_heads(self):
+        return self.config.num_key_value_heads
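FastVLM's Qwen2-based text config defaults to `tie_word_embeddings=True`, so the wrapper above reuses the input embedding matrix as the output head via `embed_tokens.as_linear`, and `sanitize` drops any stray `lm_head.weight` in that case. A minimal standalone sketch of that weight-tying pattern with a bare MLX embedding, independent of the package classes:

```python
import mlx.core as mx
import mlx.nn as nn

vocab_size, hidden_size = 16, 8
embed_tokens = nn.Embedding(vocab_size, hidden_size)

tokens = mx.array([[1, 2, 3]])
hidden = embed_tokens(tokens)            # [1, 3, hidden_size] token embeddings
logits = embed_tokens.as_linear(hidden)  # same weight reused as the output projection

print(logits.shape)  # (1, 3, 16)
```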