fount-vlm-nell-02 0.3.11 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
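
The diff hunks reproduced below correspond to mlx_vlm/models/molmo2/vision.py, mlx_vlm/models/moondream2/__init__.py, mlx_vlm/models/moondream2/config.py, and mlx_vlm/models/moondream2/image_crops.py. As an aside, the listing above can be reproduced from a local copy of the wheel (a .whl is a plain zip archive); the sketch below is illustrative only and assumes the wheel filename follows the dist-info entries shown.

import zipfile

# List every file in the wheel with its uncompressed size, mirroring the table above.
with zipfile.ZipFile("fount_vlm_nell_02-0.3.11-py3-none-any.whl") as whl:
    for info in whl.infolist():
        print(f"{info.file_size:>9}  {info.filename}")
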
mlx_vlm/models/molmo2/vision.py
@@ -0,0 +1,286 @@
+ from typing import Optional, Tuple
+
+ import mlx.core as mx
+ import mlx.nn as nn
+ import numpy as np
+
+ from ..base import interpolate
+ from .config import AdapterConfig, VisionConfig, VitConfig
+
+
+ def _gelu_from_name(name: str) -> nn.Module:
+     if name == "gelu_pytorch_tanh":
+         return nn.GELU(approx="fast")
+     return nn.GELU(approx="fast")
+
+
+ class ViTMLP(nn.Module):
+     def __init__(self, hidden_size: int, intermediate_size: int, hidden_act: str):
+         super().__init__()
+         self.w1 = nn.Linear(hidden_size, intermediate_size, bias=True)
+         self.w2 = nn.Linear(intermediate_size, hidden_size, bias=True)
+         self.act = _gelu_from_name(hidden_act)
+
+     def __call__(self, x: mx.array) -> mx.array:
+         return self.w2(self.act(self.w1(x)))
+
+
+ class ViTMultiHeadDotProductAttention(nn.Module):
+     def __init__(
+         self,
+         *,
+         hidden_size: int,
+         num_heads: int,
+         num_key_value_heads: int,
+         head_dim: int,
+         input_dim: Optional[int] = None,
+         use_bias: bool = True,
+         float32_attention: bool = True,
+     ):
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.num_heads = num_heads
+         self.num_key_value_heads = num_key_value_heads
+         self.head_dim = head_dim
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+         self.scale = head_dim**-0.5
+         self.float32_attention = float32_attention
+
+         input_dim = input_dim or hidden_size
+         self.wq = nn.Linear(input_dim, self.num_heads * self.head_dim, bias=use_bias)
+         self.wk = nn.Linear(
+             input_dim, self.num_key_value_heads * self.head_dim, bias=use_bias
+         )
+         self.wv = nn.Linear(
+             input_dim, self.num_key_value_heads * self.head_dim, bias=use_bias
+         )
+         self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)
+
+     def __call__(
+         self,
+         inputs_q: mx.array,
+         inputs_kv: Optional[mx.array] = None,
+         attn_mask: Optional[mx.array] = None,
+     ) -> mx.array:
+         if inputs_kv is None:
+             inputs_k = inputs_q
+             inputs_v = inputs_q
+         else:
+             inputs_k = inputs_kv
+             inputs_v = inputs_kv
+
+         xq = self.wq(inputs_q)
+         xk = self.wk(inputs_k)
+         xv = self.wv(inputs_v)
+
+         bsz, q_len, _ = xq.shape
+         _, kv_len, _ = xk.shape
+
+         xq = xq.reshape(bsz, q_len, self.num_heads, self.head_dim)
+         xk = xk.reshape(bsz, kv_len, self.num_key_value_heads, self.head_dim)
+         xv = xv.reshape(bsz, kv_len, self.num_key_value_heads, self.head_dim)
+
+         if self.num_heads != self.num_key_value_heads:
+             xk = mx.repeat(xk, self.num_key_value_groups, axis=2)
+             xv = mx.repeat(xv, self.num_key_value_groups, axis=2)
+
+         q = xq.transpose(0, 2, 1, 3)
+         k = xk.transpose(0, 2, 1, 3)
+         v = xv.transpose(0, 2, 1, 3)
+
+         dtype = q.dtype
+         if self.float32_attention:
+             q = q.astype(mx.float32)
+             k = k.astype(mx.float32)
+             v = v.astype(mx.float32)
+
+         scores = mx.matmul(q, k.transpose(0, 1, 3, 2)) * self.scale
+         if attn_mask is not None:
+             scores = mx.where(
+                 attn_mask,
+                 scores,
+                 mx.full(scores.shape, vals=-1e9, dtype=scores.dtype),
+             )
+
+         weights = mx.softmax(scores, axis=-1)
+         out = mx.matmul(weights, v).astype(dtype)
+         out = out.transpose(0, 2, 1, 3).reshape(bsz, q_len, -1)
+         return self.wo(out)
+
+
+ class Molmo2VisionBlock(nn.Module):
+     def __init__(self, config: VitConfig):
+         super().__init__()
+         self.attention = ViTMultiHeadDotProductAttention(
+             hidden_size=config.hidden_size,
+             num_heads=config.num_attention_heads,
+             num_key_value_heads=config.num_key_value_heads,
+             head_dim=config.head_dim,
+             float32_attention=config.float32_attention,
+             input_dim=config.hidden_size,
+         )
+         self.feed_forward = ViTMLP(
+             config.hidden_size, config.intermediate_size, config.hidden_act
+         )
+         self.attention_norm = nn.LayerNorm(
+             config.hidden_size, eps=config.layer_norm_eps
+         )
+         self.ffn_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+     def __call__(self, x: mx.array) -> mx.array:
+         x = x + self.attention(self.attention_norm(x))
+         x = x + self.feed_forward(self.ffn_norm(x))
+         return x
+
+
+ class Molmo2VisionTransformer(nn.Module):
+     def __init__(self, config: VitConfig):
+         super().__init__()
+         self.config = config
+         self.num_prefix_tokens = 0
+
+         self.positional_embedding = mx.zeros((config.image_num_pos, config.hidden_size))
+         patch_dim = config.image_patch_size * config.image_patch_size * 3
+         self.patch_embedding = nn.Linear(patch_dim, config.hidden_size, bias=True)
+         self.transformer = [
+             Molmo2VisionBlock(config) for _ in range(config.num_hidden_layers)
+         ]
+
+     def add_pos_emb(self, x: mx.array, patch_num: Tuple[int, int]) -> mx.array:
+         pos_emb = self.positional_embedding
+         pos_emb_size = int(pos_emb.shape[0] ** 0.5)
+         pos_emb = mx.reshape(pos_emb, (pos_emb_size, pos_emb_size, pos_emb.shape[1]))
+
+         patch_h, patch_w = patch_num
+         if pos_emb.shape[0] != patch_h or pos_emb.shape[1] != patch_w:
+             pos_emb = mx.transpose(pos_emb[None, ...], (0, 3, 1, 2))
+             pos_emb = interpolate(
+                 pos_emb, (patch_h, patch_w), mode="cubic", align_corners=False
+             )
+             pos_emb = mx.transpose(pos_emb, (0, 2, 3, 1))[0]
+
+         pos_emb = mx.reshape(pos_emb, (-1, pos_emb.shape[-1]))
+         return x + pos_emb[None, :, :].astype(x.dtype)
+
+     def __call__(
+         self,
+         x: mx.array,
+         patch_num: Optional[Tuple[int, int]] = None,
+     ):
+         if patch_num is None:
+             patch_num = self.config.image_num_patch
+
+         x = self.patch_embedding(x)
+         x = self.add_pos_emb(x, patch_num)
+
+         hidden_states = []
+         for block in self.transformer:
+             x = block(x)
+             hidden_states.append(x)
+         return hidden_states
+
+
+ class ImageProjectorMLP(nn.Module):
+     def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
+         super().__init__()
+         self.w1 = nn.Linear(input_dim, hidden_dim, bias=False)
+         self.w2 = nn.Linear(hidden_dim, output_dim, bias=False)
+         self.w3 = nn.Linear(input_dim, hidden_dim, bias=False)
+
+     def __call__(self, x: mx.array) -> mx.array:
+         return self.w2(nn.silu(self.w1(x)) * self.w3(x))
+
+
+ class VisionModel(nn.Module):
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.config = config
+         self.model_type = "molmo2"
+         self.vit_config: VitConfig = config.vit_config
+         self.adapter_config: AdapterConfig = config.adapter_config
+
+         self.image_vit = Molmo2VisionTransformer(self.vit_config)
+
+         self.vit_layers = []
+         for layer in self.adapter_config.vit_layers:
+             self.vit_layers.append(
+                 layer if layer >= 0 else layer + self.vit_config.num_hidden_layers
+             )
+
+         pool_dim = self.vit_config.hidden_size * len(self.vit_layers)
+
+         self.image_pooling_2d = ViTMultiHeadDotProductAttention(
+             hidden_size=self.adapter_config.hidden_size,
+             num_heads=self.adapter_config.num_attention_heads,
+             num_key_value_heads=self.adapter_config.num_key_value_heads,
+             head_dim=self.adapter_config.head_dim,
+             input_dim=pool_dim,
+             float32_attention=self.adapter_config.float32_attention,
+         )
+
+         self.image_projector = ImageProjectorMLP(
+             self.adapter_config.hidden_size,
+             self.adapter_config.intermediate_size,
+             self.adapter_config.text_hidden_size,
+         )
+
+     def encode_image(self, images: mx.array) -> mx.array:
+         batch_size, num_crops, num_patch, patch_dim = images.shape
+         images = images.reshape(batch_size * num_crops, num_patch, patch_dim)
+         hidden_states = self.image_vit(images)
+
+         features = [hidden_states[layer] for layer in self.vit_layers]
+         image_features = mx.concatenate(features, axis=-1)
+         image_features = image_features.reshape(batch_size, num_crops, num_patch, -1)
+         return image_features
+
+     def __call__(
+         self,
+         images: mx.array,
+         pooled_patches_idx: mx.array,
+     ) -> mx.array:
+         batch_size, num_crops = images.shape[:2]
+
+         image_features = self.encode_image(images)
+         dim = image_features.shape[-1]
+
+         valid = pooled_patches_idx >= 0
+         valid_token = mx.any(valid, axis=-1)
+
+         flat_features = image_features.reshape(batch_size, -1, dim)
+         idx = mx.clip(pooled_patches_idx, 0, None)
+         batch_idx = mx.arange(batch_size)[:, None, None]
+         batch_idx = mx.broadcast_to(batch_idx, idx.shape)
+
+         gathered = flat_features[mx.reshape(batch_idx, (-1,)), mx.reshape(idx, (-1,))]
+         to_pool = gathered.reshape(
+             pooled_patches_idx.shape[0],
+             pooled_patches_idx.shape[1],
+             pooled_patches_idx.shape[2],
+             dim,
+         )
+
+         to_pool = to_pool * valid[..., None].astype(to_pool.dtype)
+         to_pool = to_pool.reshape(-1, pooled_patches_idx.shape[-1], dim)
+
+         if self.adapter_config.pooling_attention_mask:
+             attn_mask = valid.reshape(-1, 1, 1, valid.shape[-1])
+             denom = valid.reshape(-1, to_pool.shape[-2]).astype(mx.float32).sum(axis=-1)
+             denom = mx.where(denom == 0, mx.ones_like(denom), denom)
+             query = to_pool.sum(axis=-2, keepdims=True) / denom[:, None, None].astype(
+                 to_pool.dtype
+             )
+         else:
+             attn_mask = None
+             query = mx.mean(to_pool, axis=-2, keepdims=True)
+
+         pooled = self.image_pooling_2d(query, to_pool, attn_mask=attn_mask)
+         pooled = pooled.reshape(batch_size, -1, pooled.shape[-1])
+         pooled = self.image_projector(pooled)
+
+         pooled = pooled.reshape(-1, pooled.shape[-1])
+
+         # MLX doesn't support boolean indexing, so convert to integer indices
+         valid_flat = np.array(valid_token).flatten()
+         valid_indices = np.where(valid_flat)[0]
+         return pooled[mx.array(valid_indices)]
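
Not part of the package diff: a minimal sketch exercising the standalone attention block above on random data, assuming the module path mlx_vlm/models/molmo2/vision.py from the file list. Shapes are illustrative; with more query heads than key/value heads, the k/v tensors are repeated internally (grouped-query attention).

import mlx.core as mx
from mlx_vlm.models.molmo2.vision import ViTMultiHeadDotProductAttention

# 8 query heads sharing 2 key/value heads; input_dim defaults to hidden_size.
attn = ViTMultiHeadDotProductAttention(
    hidden_size=64, num_heads=8, num_key_value_heads=2, head_dim=8
)
x = mx.random.normal((1, 16, 64))  # (batch, tokens, hidden)
print(attn(x).shape)  # (1, 16, 64)
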
mlx_vlm/models/moondream2/__init__.py
@@ -0,0 +1,11 @@
+ from .config import ModelConfig, TextConfig, VisionConfig
+ from .image_crops import (
+     adaptive_avg_pool2d,
+     overlap_crop_image,
+     reconstruct_from_crops,
+     select_tiling,
+ )
+ from .moondream2 import ImageProcessor, Model
+ from .vision import VisionModel
+ from .language import LanguageModel
+ from . import processing_moondream  # Registers the AutoProcessor patch
mlx_vlm/models/moondream2/config.py
@@ -0,0 +1,92 @@
+ import inspect
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from ..base import BaseModelConfig
+
+
+ @dataclass
+ class TextConfig(BaseModelConfig):
+     model_type: str = "phi"
+     hidden_size: int = 2048
+     num_hidden_layers: int = 24
+     intermediate_size: int = 8192
+     num_attention_heads: int = 32
+     num_key_value_heads: int = 32
+     vocab_size: int = 51200
+     max_position_embeddings: int = 2048
+     rope_theta: float = 10000.0
+     layer_norm_eps: float = 1e-5
+     # Moondream uses partial RoPE - only first 32 dims (head_dim // 2)
+     partial_rotary_factor: float = 0.5
+     # Prefix attention length: BOS (1) + image patches (729) = 730
+     prefix_attn_len: int = 730
+
+
+ @dataclass
+ class VisionConfig(BaseModelConfig):
+     model_type: str = "moondream_vision"
+     hidden_size: int = 1152  # enc_dim
+     num_hidden_layers: int = 27  # enc_n_layers
+     intermediate_size: int = 4304  # enc_ff_dim
+     num_attention_heads: int = 16  # enc_n_heads
+     image_size: int = 378  # crop_size
+     patch_size: int = 14  # enc_patch_size
+     num_channels: int = 3  # in_channels
+     layer_norm_eps: float = 1e-5
+     # Multi-crop settings (for future full implementation)
+     max_crops: int = 12
+     overlap_margin: int = 4
+
+
+ @dataclass
+ class ModelConfig(BaseModelConfig):
+     text_config: TextConfig = None
+     vision_config: VisionConfig = None
+     model_type: str = "moondream1"
+     # Projection MLP inner dimension
+     proj_inner_dim: int = 8192
+     # Image features are prepended after BOS token
+     image_token_index: int = -200
+     vocab_size: int = 51200
+     # Prefix attention length: BOS (1) + image patches (729) = 730
+     prefix_attn_len: int = 730
+     # Token IDs (EOS and BOS are the same for moondream)
+     eos_token_id: int = 0
+     bos_token_id: int = 0
+
+     def __post_init__(self):
+         if self.text_config is None:
+             self.text_config = TextConfig()
+         if self.vision_config is None:
+             self.vision_config = VisionConfig()
+
+     @classmethod
+     def from_dict(cls, params):
+         # Extract nested configs
+         text_config_dict = params.get("text_config", {})
+         vision_config_dict = params.get("vision_config", {})
+
+         # If text_config is empty, try to get from root level
+         if not text_config_dict:
+             text_config_dict = {
+                 k: v
+                 for k, v in params.items()
+                 if k in inspect.signature(TextConfig).parameters
+             }
+
+         # Create nested config objects
+         text_config = TextConfig.from_dict(text_config_dict)
+         vision_config = VisionConfig.from_dict(vision_config_dict)
+
+         # Build the main config
+         return cls(
+             text_config=text_config,
+             vision_config=vision_config,
+             **{
+                 k: v
+                 for k, v in params.items()
+                 if k in inspect.signature(cls).parameters
+                 and k not in ("text_config", "vision_config")
+             },
+         )
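
Not part of the package diff: a minimal sketch of ModelConfig.from_dict above with a flat parameter dict, assuming the module path from the file list and that BaseModelConfig.from_dict filters unknown keys the same way from_dict does here. Root-level text fields are folded into text_config when no nested text_config is provided.

from mlx_vlm.models.moondream2.config import ModelConfig

params = {"hidden_size": 2048, "vision_config": {"image_size": 378}}
cfg = ModelConfig.from_dict(params)
print(cfg.text_config.hidden_size)   # 2048, picked up from the root level
print(cfg.vision_config.image_size)  # 378
print(cfg.prefix_attn_len)           # 730 default: BOS (1) + 729 image patches
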
mlx_vlm/models/moondream2/image_crops.py
@@ -0,0 +1,269 @@
+ """
+ Multi-crop image processing utilities for Moondream2.
+
+ Reference implementation: moondream2/image_crops.py
+ """
+
+ import math
+ from typing import Tuple
+
+ import mlx.core as mx
+ import numpy as np
+ from PIL import Image
+
+
+ def select_tiling(
+     height: int, width: int, crop_size: int, max_crops: int
+ ) -> Tuple[int, int]:
+     """
+     Determine the optimal number of tiles to cover an image with overlapping crops.
+
+     Ported from HF reference: moondream2/image_crops.py:17-50
+     """
+     if height <= crop_size or width <= crop_size:
+         return (1, 1)
+
+     # Minimum required tiles in each dimension
+     min_h = math.ceil(height / crop_size)
+     min_w = math.ceil(width / crop_size)
+
+     # If minimum required tiles exceed max_crops, return proportional distribution
+     if min_h * min_w > max_crops:
+         ratio = math.sqrt(max_crops / (min_h * min_w))
+         return (max(1, math.floor(min_h * ratio)), max(1, math.floor(min_w * ratio)))
+
+     # Perfect aspect-ratio tiles that satisfy max_crops
+     h_tiles = math.floor(math.sqrt(max_crops * height / width))
+     w_tiles = math.floor(math.sqrt(max_crops * width / height))
+
+     # Ensure we meet minimum tile requirements
+     h_tiles = max(h_tiles, min_h)
+     w_tiles = max(w_tiles, min_w)
+
+     # If we exceeded max_crops, scale down the larger dimension
+     if h_tiles * w_tiles > max_crops:
+         if w_tiles > h_tiles:
+             w_tiles = math.floor(max_crops / h_tiles)
+         else:
+             h_tiles = math.floor(max_crops / w_tiles)
+
+     return (max(1, h_tiles), max(1, w_tiles))
+
+
+ def overlap_crop_image(
+     image: np.ndarray,
+     max_crops: int = 12,
+     overlap_margin: int = 4,
+     base_size: Tuple[int, int] = (378, 378),
+     patch_size: int = 14,
+ ) -> Tuple[np.ndarray, Tuple[int, int]]:
+     """
+     Create overlapping crops from an image for multi-scale processing.
+
+     Ported from HF reference: moondream2/image_crops.py:58-167
+
+     Args:
+         image: Input image as numpy array [H, W, C] in range [0, 255]
+         max_crops: Maximum number of local crops allowed (default 12)
+         overlap_margin: Number of patches to overlap between adjacent crops (default 4)
+         base_size: Size of each crop (default (378, 378))
+         patch_size: Size of each patch for the vision encoder (default 14)
+
+     Returns:
+         crops: numpy array [n_crops, H, W, C] - crops[0] is global, rest are local
+         tiling: (h_tiles, w_tiles) tuple describing the local crop layout
+     """
+     original_h, original_w = image.shape[:2]
+
+     # Convert margin from patch units to pixels
+     margin_pixels = patch_size * overlap_margin
+     total_margin_pixels = margin_pixels * 2  # Both sides
+
+     # Calculate crop parameters
+     crop_patches = base_size[0] // patch_size  # patches per crop dimension
+     crop_window_patches = crop_patches - (2 * overlap_margin)  # usable patches
+     crop_window_size = crop_window_patches * patch_size  # usable size in pixels
+
+     # Determine tiling using margin-adjusted dimensions and effective crop size
+     tiling = select_tiling(
+         original_h - total_margin_pixels,
+         original_w - total_margin_pixels,
+         crop_window_size,
+         max_crops,
+     )
+
+     # Pre-allocate crops
+     n_crops = tiling[0] * tiling[1] + 1  # +1 for global crop
+     crops = np.zeros(
+         (n_crops, base_size[0], base_size[1], image.shape[2]), dtype=np.uint8
+     )
+
+     # Resize image to fit tiling
+     target_size = (
+         tiling[0] * crop_window_size + total_margin_pixels,
+         tiling[1] * crop_window_size + total_margin_pixels,
+     )
+
+     pil_image = Image.fromarray(image.astype(np.uint8))
+
+     # Resize for local crops
+     resized = pil_image.resize(
+         (int(target_size[1]), int(target_size[0])),
+         resample=Image.Resampling.LANCZOS,
+     )
+     image = np.asarray(resized)
+
+     # Create global crop
+     global_crop = pil_image.resize(
+         (int(base_size[1]), int(base_size[0])),
+         resample=Image.Resampling.LANCZOS,
+     )
+     crops[0] = np.asarray(global_crop)
+
+     # Extract local crops
+     for i in range(tiling[0]):
+         for j in range(tiling[1]):
+             y0 = i * crop_window_size
+             x0 = j * crop_window_size
+
+             y_end = min(y0 + base_size[0], image.shape[0])
+             x_end = min(x0 + base_size[1], image.shape[1])
+
+             crop_region = image[y0:y_end, x0:x_end]
+             crops[
+                 1 + i * tiling[1] + j, : crop_region.shape[0], : crop_region.shape[1]
+             ] = crop_region
+
+     return crops, tiling
+
+
+ def reconstruct_from_crops(
+     local_features: mx.array,
+     tiling: Tuple[int, int],
+     overlap_margin: int = 4,
+ ) -> mx.array:
+     """
+     Reconstruct a unified feature map from local crop features.
+
+     This function stitches together the features from local crops,
+     handling the overlap regions by trimming interior margins.
+
+     Args:
+         local_features: [n_local, 27, 27, 1152] features from local crops
+             (27x27 patches per crop, each with 1152-dim features)
+         tiling: (h_tiles, w_tiles) describing the crop layout
+         overlap_margin: Number of patches that overlap between adjacent crops (default 4)
+
+     Returns:
+         Reconstructed feature map [H, W, 1152] where:
+             H = h_tiles * (27 - 2*overlap_margin) + 2*overlap_margin
+             W = w_tiles * (27 - 2*overlap_margin) + 2*overlap_margin
+     """
+     h_tiles, w_tiles = tiling
+     n_local = h_tiles * w_tiles
+     patches_per_side = 27  # 378 / 14 = 27 patches per crop side
+     hidden_size = local_features.shape[-1]  # 1152
+
+     # Effective patches per crop after removing interior overlaps
+     effective_patches = patches_per_side - 2 * overlap_margin  # 27 - 8 = 19
+
+     # Output feature map size
+     out_h = h_tiles * effective_patches + 2 * overlap_margin
+     out_w = w_tiles * effective_patches + 2 * overlap_margin
+
+     # Initialize output
+     # Use numpy for easier slicing, convert to mx at the end
+     local_np = np.array(local_features)
+     output = np.zeros((out_h, out_w, hidden_size), dtype=local_np.dtype)
+
+     crop_idx = 0
+     for i in range(h_tiles):
+         for j in range(w_tiles):
+             crop_features = local_np[crop_idx]  # [27, 27, 1152]
+
+             # Determine which margins to keep based on position
+             top_margin = overlap_margin if i == 0 else 0
+             bottom_margin = overlap_margin if i == h_tiles - 1 else 0
+             left_margin = overlap_margin if j == 0 else 0
+             right_margin = overlap_margin if j == w_tiles - 1 else 0
+
+             # Trim interior margins
+             start_y = 0 if i == 0 else overlap_margin
+             end_y = patches_per_side if i == h_tiles - 1 else patches_per_side - overlap_margin
+             start_x = 0 if j == 0 else overlap_margin
+             end_x = patches_per_side if j == w_tiles - 1 else patches_per_side - overlap_margin
+
+             trimmed = crop_features[start_y:end_y, start_x:end_x]
+
+             # Calculate output position
+             out_y = 0 if i == 0 else (patches_per_side - overlap_margin) + (i - 1) * effective_patches
+             out_x = 0 if j == 0 else (patches_per_side - overlap_margin) + (j - 1) * effective_patches
+
+             out_h_slice = end_y - start_y
+             out_w_slice = end_x - start_x
+
+             output[out_y : out_y + out_h_slice, out_x : out_x + out_w_slice] = trimmed
+
+             crop_idx += 1
+
+     return mx.array(output)
+
+
+ def adaptive_avg_pool2d(
+     x: mx.array,
+     output_size: Tuple[int, int],
+ ) -> mx.array:
+     """
+     Adaptive average pooling that pools input to a fixed output size.
+
+     Args:
+         x: Input tensor [H, W, C] or [C, H, W]
+         output_size: Target (H_out, W_out)
+
+     Returns:
+         Pooled tensor with spatial dimensions matching output_size
+     """
+     # Assume input is [H, W, C] (channel last)
+     H, W, C = x.shape
+     out_h, out_w = output_size
+
+     if H == out_h and W == out_w:
+         return x
+
+     # Calculate kernel and stride sizes for adaptive pooling
+     # Kernel size = ceil(input_size / output_size)
+     # Stride = floor(input_size / output_size)
+     kernel_h = (H + out_h - 1) // out_h
+     kernel_w = (W + out_w - 1) // out_w
+     stride_h = H // out_h
+     stride_w = W // out_w
+
+     # Pad if necessary to ensure we can cover the output size
+     pad_h = max(0, (out_h - 1) * stride_h + kernel_h - H)
+     pad_w = max(0, (out_w - 1) * stride_w + kernel_w - W)
+
+     if pad_h > 0 or pad_w > 0:
+         # Pad with zeros
+         x = mx.pad(x, [(0, pad_h), (0, pad_w), (0, 0)])
+
+     # Perform pooling using a simple averaging approach
+     # Convert to [1, H, W, C] for batch processing
+     x = x[None, :, :, :]  # [1, H, W, C]
+
+     # Use reshape and mean for pooling
+     result = np.zeros((out_h, out_w, C), dtype=np.float32)
+     x_np = np.array(x[0])  # [H, W, C]
+
+     for i in range(out_h):
+         for j in range(out_w):
+             # Calculate the input region for this output pixel
+             h_start = i * stride_h
+             h_end = min(h_start + kernel_h, x_np.shape[0])
+             w_start = j * stride_w
+             w_end = min(w_start + kernel_w, x_np.shape[1])
+
+             # Average pool
+             region = x_np[h_start:h_end, w_start:w_end, :]
+             result[i, j, :] = region.mean(axis=(0, 1))
+
+     return mx.array(result)
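
Not part of the package diff: a minimal sketch running the crop utilities above on a synthetic image, assuming the module path from the file list. Only shapes are checked, no model weights are involved. For an 800x1200 input with the defaults, select_tiling caps the local grid at (2, 4), so overlap_crop_image returns 8 local crops plus one global crop, and reconstruct_from_crops stitches the 27x27 patch grids back into a 46x84 grid (19 effective patches per tile plus the outer 4-patch margins).

import mlx.core as mx
import numpy as np
from mlx_vlm.models.moondream2.image_crops import (
    overlap_crop_image,
    reconstruct_from_crops,
)

image = (np.random.rand(800, 1200, 3) * 255).astype(np.uint8)
crops, tiling = overlap_crop_image(image, max_crops=12, overlap_margin=4)
print(crops.shape, tiling)  # (9, 378, 378, 3) (2, 4); crops[0] is the global crop

# Stand-in ViT features for the 8 local crops: 27x27 patches, 1152 dims each.
local = mx.zeros((tiling[0] * tiling[1], 27, 27, 1152))
grid = reconstruct_from_crops(local, tiling, overlap_margin=4)
print(grid.shape)  # (46, 84, 1152)
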