fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
@@ -0,0 +1,408 @@
+ from typing import List, Optional, Tuple
+
+ import mlx.core as mx
+ import mlx.nn as nn
+
+ from .config import VisionConfig
+
+
+ class MLP(nn.Module):
+     def __init__(self, config: VisionConfig, input_dim: int):
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         self.w1 = nn.Linear(
+             input_dim,
+             self.hidden_size,
+             bias=False,
+         )
+         self.w2 = nn.Linear(
+             self.hidden_size,
+             config.d_model,
+             bias=False,
+         )
+         self.w3 = nn.Linear(
+             input_dim,
+             self.hidden_size,
+             bias=False,
+         )
+
+     def __call__(self, x: mx.array) -> mx.array:
+         x = self.w2(nn.silu(self.w1(x)) * self.w3(x))
+         return x
+
+
+ class ViTMLP(nn.Module):
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.config = config
+         self.w1 = nn.Linear(config.image_emb_dim, config.image_mlp_dim, bias=True)
+         self.w2 = nn.Linear(config.image_mlp_dim, config.image_emb_dim, bias=True)
+         self.act = nn.GELU(approx="fast")
+
+     def __call__(self, x: mx.array) -> mx.array:
+         x = self.w1(x)
+         x = self.act(x)
+         x = self.w2(x)
+         return x
+
+
+ class MultiHeadDotProductAttention(nn.Module):
+     def __init__(self, config: VisionConfig, is_vit_layer: Optional[bool] = True):
+         super().__init__()
+         self.config = config
+         self.embed_dim = config.image_emb_dim
+         self.num_heads = config.image_num_heads
+         self.head_dim = config.image_head_dim
+         self.num_key_value_heads = config.image_num_key_value_heads
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+         self.scale = self.head_dim**-0.5
+         self.is_vit_layer = is_vit_layer
+
+         n_layers = (
+             1 if (is_vit_layer or config.vit_layers is None) else len(config.vit_layers)
+         )
+
+         self.wq = nn.Linear(
+             n_layers * self.embed_dim, self.num_heads * self.head_dim, bias=True
+         )
+         self.wk = nn.Linear(
+             n_layers * self.embed_dim,
+             self.num_key_value_heads * self.head_dim,
+             bias=True,
+         )
+         self.wv = nn.Linear(
+             n_layers * self.embed_dim,
+             self.num_key_value_heads * self.head_dim,
+             bias=True,
+         )
+         self.wo = nn.Linear(self.num_heads * self.head_dim, self.embed_dim, bias=True)
+
+     def _split_heads(self, hidden_states, num_heads) -> mx.array:
+         return hidden_states.reshape(
+             hidden_states.shape[:2] + (num_heads, self.head_dim)
+         )
+
+     def _merge_heads(self, hidden_states) -> mx.array:
+         return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+     def __call__(self, x: mx.array, kv: mx.array = None) -> mx.array:
+         batch_size, seq_len, _ = x.shape
+
+         if kv is None:
+             k = x
+             v = x
+         else:
+             k = kv
+             v = kv
+         q = self._split_heads(self.wq(x), self.num_heads).transpose(0, 2, 1, 3)
+
+         k = self._split_heads(self.wk(k), self.num_key_value_heads).transpose(
+             0, 2, 1, 3
+         )
+         v = self._split_heads(self.wv(v), self.num_key_value_heads).transpose(
+             0, 2, 1, 3
+         )
+
+         attn = mx.fast.scaled_dot_product_attention(q, k, v, scale=self.scale)
+         out = attn.transpose(0, 2, 1, 3)
+         out = self._merge_heads(out)
+         out = self.wo(out)
+         return out
+
+
+ class ResidualAttentionBlock(nn.Module):
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.config = config
+         self.attention = MultiHeadDotProductAttention(config)
+         self.feed_forward = ViTMLP(config)
+         self.attention_norm = nn.LayerNorm(
+             config.image_emb_dim, eps=config.image_norm_eps
+         )
+         self.ffn_norm = nn.LayerNorm(config.image_emb_dim, eps=config.image_norm_eps)
+
+     def __call__(self, x: mx.array) -> mx.array:
+         x = x + self.attention(self.attention_norm(x))
+         x = x + self.feed_forward(self.ffn_norm(x))
+         return x
+
+
+ class ResidualAttentionBlocks(nn.Module):
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.resblocks = [
+             ResidualAttentionBlock(config) for _ in range(config.image_num_layers)
+         ]
+
+     def __call__(self, x: mx.array) -> mx.array:
+         h = []
+         for block in self.resblocks:
+             x = block(x)
+             h.append(x)
+         return h
+
+
+ def _expand_token(token, batch_size: int):
+     return mx.broadcast_to(
+         mx.reshape(token, (1, 1, -1)), (batch_size, 1, token.shape[-1])
+     )
+
+
+ def pad_to_multiple(x, target_size, pad_mode="edge", pad_value=0):
+     """
+     Pad the last dimension of input tensor to match target size.
+
+     Args:
+         x: Input tensor with shape [..., D]
+         target_size: Desired size for the last dimension
+         pad_mode: Padding mode ('constant', 'reflect', etc.)
+         pad_value: Value to use for constant padding
+
+     Returns:
+         Padded tensor with shape [..., target_size]
+     """
+     current_size = x.shape[-1]
+
+     # Return early if no padding needed
+     if current_size == target_size:
+         return x
+
+     # Ensure target size is larger
+     if current_size > target_size:
+         raise ValueError(
+             f"Current size {current_size} is larger than target size {target_size}"
+         )
+
+     # Calculate padding needed
+     pad_size = target_size - current_size
+
+     # Create padding configuration
+     # No padding for batch and channel dimensions (0,0), only pad the last dim
+     pad_config = [(0, 0)] * (len(x.shape) - 1) + [(0, pad_size)]
+
+     return mx.pad(x, pad_width=pad_config, mode=pad_mode, constant_values=pad_value)
+
+
+ class VisionTransformer(nn.Module):
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.config = config
+         self.class_embedding = mx.zeros((config.image_emb_dim,))
+         self.positional_embedding = mx.zeros(
+             (config.image_num_pos, config.image_emb_dim)
+         )
+         self.patch_embedding = nn.Linear(
+             config.intermediate_size,
+             config.image_emb_dim,
+             bias=False,
+         )
+         self.pre_ln = nn.LayerNorm(config.image_emb_dim, eps=config.image_norm_eps)
+         self.transformer = ResidualAttentionBlocks(config)
+
+     def add_pos_emb(self, x: mx.array, patch_num: int) -> mx.array:
+         cls_emb = self.positional_embedding[0:1]
+         pos_emb = self.positional_embedding[1:]
+
+         # Reshape into 2D grid
+         pos_emb_size = int(pos_emb.shape[0] ** 0.5)
+         pos_emb = mx.reshape(pos_emb, (pos_emb_size, pos_emb_size, pos_emb.shape[1]))
+
+         (patch_num_0, patch_num_1) = patch_num
+
+         if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
+             # Reshape for upsampling (add batch and channel dims)
+             pos_emb = mx.expand_dims(pos_emb, 0)
+             pos_emb = mx.transpose(pos_emb, (0, 3, 1, 2))
+
+             # Create and apply upsampler
+             upsampler = nn.Upsample(
+                 scale_factor=(
+                     patch_num_0 / pos_emb.shape[2],
+                     patch_num_1 / pos_emb.shape[3],
+                 ),
+                 mode="linear",  # MLX doesn't have bicubic, using linear as closest alternative
+                 align_corners=False,
+             )
+             pos_emb = upsampler(pos_emb)
+
+             # Restore original dimensions
+             pos_emb = mx.transpose(pos_emb, (0, 2, 3, 1))
+             pos_emb = mx.squeeze(pos_emb, 0)
+
+         pos_emb = mx.reshape(pos_emb, (-1, pos_emb.shape[-1]))
+
+         # Expand cls_emb and pos_emb
+         expanded_cls = cls_emb[None, :, :]
+         expanded_pos = pos_emb[None, :, :]
+
+         # Concatenate and add to x
+         pos_embedding = mx.concatenate([expanded_cls, expanded_pos], axis=1)
+         x = x + pos_embedding
+         return x
+
+     def __call__(self, x: mx.array, patch_num: int = None) -> List[mx.array]:
+         """
+         : param x: (batch_size, num_patch, n_pixels)
+         """
+         if patch_num is None:
+             patch_num = self.config.image_num_patch
+         B, N, D = x.shape
+
+         # (Optional) Due to quantization, pad around the image to match intermediate_size
+         x = pad_to_multiple(x, self.config.intermediate_size)
+
+         x = self.patch_embedding(x)
+
+         # class embeddings and positional embeddings
+         expanded_class_emb = _expand_token(self.class_embedding, x.shape[0])
+         expanded_class_emb = expanded_class_emb
+
+         x = mx.concatenate([expanded_class_emb, x], axis=1)
+         x = self.add_pos_emb(x, patch_num)
+
+         x = self.pre_ln(x)
+
+         hidden_states = self.transformer(x)
+         return hidden_states
+
+
+ class VisionModel(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.model_type = config.model_type
+         if self.model_type != "molmo":
+             raise ValueError(
+                 f"Model type {self.model_type} not supported. Currently only 'molmo' is supported"
+             )
+         self.image_vit = VisionTransformer(config)
+         self.num_prefix_tokens = 1
+
+         self.image_pooling_2d = MultiHeadDotProductAttention(config, is_vit_layer=False)
+         self.image_projector = MLP(config, config.image_emb_dim)
+         self.pad_embed = mx.zeros((2, config.image_emb_dim * 2))
+
+     def encode_image(self, images: mx.array) -> mx.array:
+         """
+         : param images: (batch_size, num_crops, num_patch, n_pixels)
+         """
+         cfg = self.config
+         B, T, N, D = images.shape
+
+         # Check for -1 values across dimensions 1 and 2
+         reshaped_images = mx.reshape(images, (B * T, N, D))
+         mask = ~mx.all(reshaped_images == -1, axis=(1, 2), keepdims=True)
+
+         # Output all hidden states
+         images = reshaped_images
+         image_features = self.image_vit(images)
+
+         if cfg.vit_layers is not None:
+             features = []
+             for layer in cfg.vit_layers:
+                 features.append(image_features[layer])
+             image_features = mx.concatenate(features, axis=-1)
+         else:
+             image_features = image_features[-1]
+
+         cls_embed = None
+         if self.num_prefix_tokens > 0:
+             cls_embed = image_features[:, 0]
+             image_features = image_features[:, 1:]
+
+         image_features = image_features * mask
+         image_features = mx.reshape(image_features, (B, T, N, -1))
+
+         cls_embed = mx.reshape(cls_embed, (B, T, -1)) if cls_embed is not None else None
+
+         return image_features, cls_embed
+
+     def __call__(
+         self, images: mx.array, image_masks: mx.array
+     ) -> Tuple[mx.array, Optional[mx.array]]:
+         cfg = self.config
+
+         batch_size, num_image = images.shape[:2]
+         image_features, cls_embed = self.encode_image(images)
+
+         if cfg.image_padding_embed:
+             assert image_masks is not None
+             if cfg.image_padding_embed == "pad_embed":
+                 all_pad = image_masks == 0
+                 pad_embed = mx.reshape(self.pad_embed, (1, 1, 1, -1))
+                 image_features = image_features + pad_embed * mx.expand_dims(
+                     all_pad, -1
+                 )
+             elif cfg.image_padding_embed == "regress":
+                 pad_embed = mx.reshape(self.pad_embed, (1, 1, 1, -1))
+                 image_features = image_features + pad_embed * mx.expand_dims(
+                     mx.maximum(image_masks, mx.zeros_like(image_masks)), -1
+                 )
+             elif cfg.image_padding_embed == "pad_and_partial_pad":
+                 pad_embed = mx.reshape(self.pad_embed, (2, 1, 1, 1, -1))
+                 all_pad = image_masks == 0
+                 partial_pad = mx.logical_and(image_masks < 1, mx.logical_not(all_pad))
+                 partial_pad = partial_pad
+                 all_pad = all_pad
+                 image_features = image_features + pad_embed[0] * mx.expand_dims(
+                     all_pad, -1
+                 )
+                 image_features = image_features + pad_embed[1] * mx.expand_dims(
+                     partial_pad, -1
+                 )
+             else:
+                 raise ValueError(cfg.image_padding_embed)
+
+         image_features = mx.reshape(
+             image_features, (batch_size, num_image) + cfg.image_num_patch + (-1,)
+         )
+
+         if cfg.image_num_patch[0] % cfg.image_pooling_h == 1:
+             # Pad so we can still pool 2x2 patches
+             image_features = mx.pad(
+                 image_features, [(0, 0), (0, 0), (0, 1), (0, 1), (0, 0)]
+             )
+
+         # image pooling
+         # MLX equivalent of einops rearrange
+         h_blocks = image_features.shape[2] // cfg.image_pooling_h
+         w_blocks = image_features.shape[3] // cfg.image_pooling_w
+         image_features = mx.reshape(
+             mx.transpose(
+                 mx.reshape(
+                     image_features,
+                     (
+                         batch_size,
+                         num_image,
+                         h_blocks,
+                         cfg.image_pooling_h,
+                         w_blocks,
+                         cfg.image_pooling_w,
+                         -1,
+                     ),
+                 ),
+                 (0, 1, 2, 4, 3, 5, 6),
+             ),
+             (
+                 batch_size * num_image * h_blocks * w_blocks,
+                 cfg.image_pooling_h * cfg.image_pooling_w,
+                 -1,
+             ),
+         )
+
+         if cfg.image_pooling_2d == "attention-meanq":
+             query = mx.mean(image_features, axis=-2, keepdims=True)
+             image_features = self.image_pooling_2d(query, image_features)
+         elif cfg.image_pooling_2d not in {"none", "stack"}:
+             image_features = self.image_pooling_2d(
+                 image_features[:, :1, :], image_features
+             )
+
+         h, w = cfg.llm_patches_per_crop
+         image_features = mx.reshape(image_features, (batch_size, num_image, h * w, -1))
+
+         # # MLP layer to map the feature
+         image_features = self.image_projector(image_features)
+
+         return image_features, cls_embed
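
For reference, a minimal sketch (not part of the packaged file above) of the pooling rearrangement marked "MLX equivalent of einops rearrange": the toy shapes and the einops pattern in the comment are assumptions for illustration only.

import mlx.core as mx

# Toy shapes chosen only to demonstrate the reshape/transpose/reshape pattern.
batch_size, num_image, H, W, C = 1, 2, 24, 24, 64
pool_h = pool_w = 2
x = mx.zeros((batch_size, num_image, H, W, C))

h_blocks, w_blocks = H // pool_h, W // pool_w
pooled = mx.reshape(
    mx.transpose(
        mx.reshape(x, (batch_size, num_image, h_blocks, pool_h, w_blocks, pool_w, -1)),
        (0, 1, 2, 4, 3, 5, 6),
    ),
    (batch_size * num_image * h_blocks * w_blocks, pool_h * pool_w, -1),
)
# Equivalent einops pattern (assumption; einops is not used by the module above):
#   rearrange(x, "b n (hb ph) (wb pw) c -> (b n hb wb) (ph pw) c", ph=pool_h, pw=pool_w)
print(pooled.shape)  # (288, 4, 64): one row per 2x2 pooling window
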
@@ -0,0 +1,6 @@
+ from .config import AdapterConfig, ModelConfig, TextConfig, VisionConfig, VitConfig
+ from .language import LanguageModel
+ from .molmo2 import Model
+ from .processing import Molmo2ImageProcessor as ImageProcessor
+ from .processing import Molmo2Processor as Processor
+ from .vision import VisionModel
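
A hypothetical import sketch for the re-exported names above; only the aliases come from this `__init__`, and loading real weights is outside this diff.

# Hypothetical usage; only the alias names are taken from the __init__ above.
from mlx_vlm.models.molmo2 import ImageProcessor, Model, ModelConfig, Processor

config = ModelConfig()  # dataclass defaults; real checkpoints override these via from_dict
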
@@ -0,0 +1,137 @@
+ import inspect
+ from dataclasses import dataclass, field
+ from typing import List, Optional
+
+ from ..base import BaseModelConfig
+
+
+ @dataclass
+ class VitConfig(BaseModelConfig):
+     model_type: str = "molmo2"
+     hidden_size: int = 1152
+     intermediate_size: int = 4304
+     num_hidden_layers: int = 25  # Note: HF config says 27 but weights only have 25
+     num_attention_heads: int = 16
+     num_key_value_heads: int = 16
+     head_dim: int = 72
+     image_patch_size: int = 14
+     image_num_pos: int = 729
+     image_default_input_size: List[int] = field(default_factory=lambda: [378, 378])
+     hidden_act: str = "gelu_pytorch_tanh"
+     layer_norm_eps: float = 1e-6
+     attention_dropout: float = 0.0
+     residual_dropout: float = 0.0
+     float32_attention: bool = True
+     attn_implementation: str = "sdpa"
+
+     @classmethod
+     def from_dict(cls, params):
+         # Workaround: HuggingFace config says 27 layers but weights only have 25
+         # Override to use 25 layers
+         if params.get("num_hidden_layers", 25) > 25:
+             params = dict(params)  # Don't modify original
+             params["num_hidden_layers"] = 25
+         return super().from_dict(params)
+
+     @property
+     def image_num_patch(self):
+         h, w = self.image_default_input_size
+         return h // self.image_patch_size, w // self.image_patch_size
+
+
+ @dataclass
+ class AdapterConfig(BaseModelConfig):
+     model_type: str = "molmo2"
+     hidden_size: int = 1152
+     intermediate_size: int = 9728
+     text_hidden_size: int = 2560
+     num_attention_heads: int = 16
+     num_key_value_heads: int = 16
+     head_dim: int = 72
+     hidden_act: str = "silu"
+     vit_layers: List[int] = field(default_factory=lambda: [-3, -9])
+     image_feature_dropout: float = 0.0
+     pooling_attention_mask: bool = True
+     attention_dropout: float = 0.0
+     residual_dropout: float = 0.0
+     float32_attention: bool = True
+     attn_implementation: str = "sdpa"
+
+
+ @dataclass
+ class VisionConfig(BaseModelConfig):
+     vit_config: VitConfig = field(default_factory=VitConfig)
+     adapter_config: AdapterConfig = field(default_factory=AdapterConfig)
+
+     @classmethod
+     def from_dict(cls, params):
+         vit_cfg = params.get("vit_config", {})
+         adapter_cfg = params.get("adapter_config", {})
+         return cls(
+             vit_config=VitConfig.from_dict(vit_cfg),
+             adapter_config=AdapterConfig.from_dict(adapter_cfg),
+         )
+
+
+ @dataclass
+ class TextConfig(BaseModelConfig):
+     model_type: str = "molmo2"
+     hidden_size: int = 2560
+     intermediate_size: int = 9728
+     num_hidden_layers: int = 36
+     num_attention_heads: int = 32
+     num_key_value_heads: int = 8
+     head_dim: int = 128
+     vocab_size: int = 151936
+     additional_vocab_size: int = 128
+     hidden_act: str = "silu"
+     layer_norm_eps: float = 1e-6
+     attention_dropout: float = 0.0
+     residual_dropout: float = 0.0
+     embedding_dropout: float = 0.0
+     max_position_embeddings: int = 36864
+     rope_theta: float = 5000000.0
+     rope_scaling: Optional[dict] = None
+     use_qk_norm: bool = True
+     qk_norm_type: str = "qwen3"
+     qkv_bias: bool = False
+     use_cache: bool = True
+     norm_after: bool = False
+
+
+ @dataclass
+ class ModelConfig(BaseModelConfig):
+     text_config: TextConfig = field(default_factory=TextConfig)
+     vision_config: VisionConfig = field(default_factory=VisionConfig)
+     model_type: str = "molmo2"
+
+     image_start_token_id: int = 151936
+     low_res_image_start_token_id: int = 151940
+     image_end_token_id: int = 151937
+     image_low_res_id: int = 151942
+     image_patch_id: int = 151938
+     image_col_id: int = 151939
+     frame_start_token_id: int = 151943
+     frame_end_token_id: int = 151944
+     use_frame_special_tokens: bool = False
+
+     tie_word_embeddings: bool = False
+     initializer_range: float = 0.02
+     eos_token_id: Optional[List[int]] = None
+
+     @classmethod
+     def from_dict(cls, params):
+         # Normalize how the repo loads configs: always provide `vision_config`.
+         if not params.get("vision_config"):
+             params["vision_config"] = {
+                 "vit_config": params.get("vit_config", {}),
+                 "adapter_config": params.get("adapter_config", {}),
+             }
+
+         return cls(
+             **{
+                 k: v
+                 for k, v in params.items()
+                 if k in inspect.signature(cls).parameters
+             }
+         )
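
A minimal sketch (not part of the package) of the two behaviors in the config above: the 27-to-25 layer clamp in `VitConfig.from_dict` and the flat-to-nested normalization in `ModelConfig.from_dict`. It assumes `BaseModelConfig.from_dict` builds the dataclass from the filtered dict, the same pattern `ModelConfig.from_dict` spells out explicitly; the example dicts are hypothetical, not a real checkpoint config.

# Hypothetical values; not taken from a real checkpoint config.
vit = VitConfig.from_dict({"num_hidden_layers": 27, "image_patch_size": 14})
print(vit.num_hidden_layers)  # 25: clamped to match the shipped weights
print(vit.image_num_patch)    # (27, 27): 378 // 14 patches per side

# A flat HF-style dict gains a nested vision_config before ModelConfig is built.
flat = {"model_type": "molmo2", "vit_config": {}, "adapter_config": {}}
cfg = ModelConfig.from_dict(flat)
print("vision_config" in flat)  # True: from_dict injected the nested section
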