PyPI - fount-vlm-nell-02 - Versions diffs - 0.3.11__py3-none-any.whl - Mend

fount-vlm-nell-02 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (258) hide show

fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
mlx_vlm/__init__.py +16 -0
mlx_vlm/__main__.py +24 -0
mlx_vlm/chat.py +234 -0
mlx_vlm/chat_ui.py +508 -0
mlx_vlm/convert.py +284 -0
mlx_vlm/deprecation.py +52 -0
mlx_vlm/evals/__init__.py +0 -0
mlx_vlm/evals/math_vista.py +565 -0
mlx_vlm/evals/mmmu.py +528 -0
mlx_vlm/evals/mmstar.py +343 -0
mlx_vlm/evals/ocrbench.py +453 -0
mlx_vlm/evals/utils.py +37 -0
mlx_vlm/generate.py +1457 -0
mlx_vlm/lora.py +207 -0
mlx_vlm/models/__init__.py +0 -0
mlx_vlm/models/aya_vision/__init__.py +2 -0
mlx_vlm/models/aya_vision/aya_vision.py +188 -0
mlx_vlm/models/aya_vision/config.py +52 -0
mlx_vlm/models/aya_vision/language.py +202 -0
mlx_vlm/models/aya_vision/vision.py +340 -0
mlx_vlm/models/base.py +356 -0
mlx_vlm/models/cache.py +238 -0
mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
mlx_vlm/models/deepseekocr/__init__.py +2 -0
mlx_vlm/models/deepseekocr/config.py +173 -0
mlx_vlm/models/deepseekocr/conversation.py +264 -0
mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
mlx_vlm/models/deepseekocr/language.py +547 -0
mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
mlx_vlm/models/deepseekocr/sam.py +489 -0
mlx_vlm/models/deepseekocr/vision.py +263 -0
mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
mlx_vlm/models/deepseekocr_2/config.py +216 -0
mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
mlx_vlm/models/deepseekocr_2/vision.py +439 -0
mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
mlx_vlm/models/fastvlm/__init__.py +2 -0
mlx_vlm/models/fastvlm/config.py +79 -0
mlx_vlm/models/fastvlm/fastvlm.py +198 -0
mlx_vlm/models/fastvlm/language.py +49 -0
mlx_vlm/models/fastvlm/vision.py +692 -0
mlx_vlm/models/florence2/__init__.py +2 -0
mlx_vlm/models/florence2/config.py +84 -0
mlx_vlm/models/florence2/florence2.py +383 -0
mlx_vlm/models/florence2/language.py +452 -0
mlx_vlm/models/florence2/processing_florence2.py +30 -0
mlx_vlm/models/florence2/vision.py +552 -0
mlx_vlm/models/gemma3/__init__.py +2 -0
mlx_vlm/models/gemma3/config.py +52 -0
mlx_vlm/models/gemma3/gemma3.py +194 -0
mlx_vlm/models/gemma3/language.py +293 -0
mlx_vlm/models/gemma3/vision.py +215 -0
mlx_vlm/models/gemma3n/__init__.py +2 -0
mlx_vlm/models/gemma3n/audio.py +1038 -0
mlx_vlm/models/gemma3n/config.py +130 -0
mlx_vlm/models/gemma3n/gemma3n.py +322 -0
mlx_vlm/models/gemma3n/language.py +631 -0
mlx_vlm/models/gemma3n/vision.py +994 -0
mlx_vlm/models/glm4v/__init__.py +3 -0
mlx_vlm/models/glm4v/config.py +79 -0
mlx_vlm/models/glm4v/glm4v.py +188 -0
mlx_vlm/models/glm4v/language.py +574 -0
mlx_vlm/models/glm4v/processing.py +220 -0
mlx_vlm/models/glm4v/vision.py +406 -0
mlx_vlm/models/glm4v_moe/__init__.py +3 -0
mlx_vlm/models/glm4v_moe/config.py +81 -0
mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
mlx_vlm/models/glm4v_moe/language.py +674 -0
mlx_vlm/models/glm4v_moe/processing.py +229 -0
mlx_vlm/models/glm4v_moe/vision.py +405 -0
mlx_vlm/models/glm_ocr/__init__.py +3 -0
mlx_vlm/models/glm_ocr/config.py +93 -0
mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
mlx_vlm/models/glm_ocr/language.py +585 -0
mlx_vlm/models/glm_ocr/processing.py +208 -0
mlx_vlm/models/glm_ocr/vision.py +342 -0
mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
mlx_vlm/models/hunyuan_vl/config.py +136 -0
mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
mlx_vlm/models/hunyuan_vl/language.py +509 -0
mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
mlx_vlm/models/hunyuan_vl/vision.py +322 -0
mlx_vlm/models/idefics2/__init__.py +2 -0
mlx_vlm/models/idefics2/config.py +65 -0
mlx_vlm/models/idefics2/idefics2.py +321 -0
mlx_vlm/models/idefics2/language.py +161 -0
mlx_vlm/models/idefics2/vision.py +244 -0
mlx_vlm/models/idefics3/__init__.py +4 -0
mlx_vlm/models/idefics3/config.py +54 -0
mlx_vlm/models/idefics3/idefics3.py +221 -0
mlx_vlm/models/idefics3/language.py +157 -0
mlx_vlm/models/idefics3/vision.py +265 -0
mlx_vlm/models/internvl_chat/__init__.py +3 -0
mlx_vlm/models/internvl_chat/config.py +89 -0
mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
mlx_vlm/models/internvl_chat/language.py +187 -0
mlx_vlm/models/internvl_chat/processor.py +395 -0
mlx_vlm/models/internvl_chat/vision.py +265 -0
mlx_vlm/models/interpolate.py +183 -0
mlx_vlm/models/jina_vlm/__init__.py +3 -0
mlx_vlm/models/jina_vlm/config.py +142 -0
mlx_vlm/models/jina_vlm/image_processor.py +430 -0
mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
mlx_vlm/models/jina_vlm/language.py +272 -0
mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
mlx_vlm/models/jina_vlm/vision.py +202 -0
mlx_vlm/models/kernels.py +447 -0
mlx_vlm/models/kimi_vl/__init__.py +4 -0
mlx_vlm/models/kimi_vl/config.py +84 -0
mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
mlx_vlm/models/kimi_vl/language.py +460 -0
mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
mlx_vlm/models/kimi_vl/vision.py +485 -0
mlx_vlm/models/lfm2_vl/__init__.py +2 -0
mlx_vlm/models/lfm2_vl/config.py +94 -0
mlx_vlm/models/lfm2_vl/language.py +49 -0
mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
mlx_vlm/models/lfm2_vl/vision.py +223 -0
mlx_vlm/models/llama4/__init__.py +2 -0
mlx_vlm/models/llama4/config.py +83 -0
mlx_vlm/models/llama4/language.py +334 -0
mlx_vlm/models/llama4/llama4.py +146 -0
mlx_vlm/models/llama4/vision.py +526 -0
mlx_vlm/models/llava/__init__.py +2 -0
mlx_vlm/models/llava/config.py +61 -0
mlx_vlm/models/llava/language.py +200 -0
mlx_vlm/models/llava/llava.py +132 -0
mlx_vlm/models/llava/vision.py +233 -0
mlx_vlm/models/llava_bunny/__init__.py +2 -0
mlx_vlm/models/llava_bunny/config.py +85 -0
mlx_vlm/models/llava_bunny/language.py +194 -0
mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
mlx_vlm/models/llava_bunny/vision.py +278 -0
mlx_vlm/models/llava_next/__init__.py +2 -0
mlx_vlm/models/llava_next/config.py +60 -0
mlx_vlm/models/llava_next/language.py +192 -0
mlx_vlm/models/llava_next/llava_next.py +138 -0
mlx_vlm/models/llava_next/vision.py +217 -0
mlx_vlm/models/mistral3/__init__.py +2 -0
mlx_vlm/models/mistral3/config.py +59 -0
mlx_vlm/models/mistral3/language.py +269 -0
mlx_vlm/models/mistral3/mistral3.py +383 -0
mlx_vlm/models/mllama/__init__.py +4 -0
mlx_vlm/models/mllama/config.py +74 -0
mlx_vlm/models/mllama/language.py +377 -0
mlx_vlm/models/mllama/mllama.py +210 -0
mlx_vlm/models/mllama/vision.py +458 -0
mlx_vlm/models/molmo/__init__.py +5 -0
mlx_vlm/models/molmo/config.py +93 -0
mlx_vlm/models/molmo/language.py +208 -0
mlx_vlm/models/molmo/molmo.py +108 -0
mlx_vlm/models/molmo/processing_molmo.py +763 -0
mlx_vlm/models/molmo/vision.py +408 -0
mlx_vlm/models/molmo2/__init__.py +6 -0
mlx_vlm/models/molmo2/config.py +137 -0
mlx_vlm/models/molmo2/language.py +206 -0
mlx_vlm/models/molmo2/molmo2.py +330 -0
mlx_vlm/models/molmo2/processing.py +773 -0
mlx_vlm/models/molmo2/vision.py +286 -0
mlx_vlm/models/moondream2/__init__.py +11 -0
mlx_vlm/models/moondream2/config.py +92 -0
mlx_vlm/models/moondream2/image_crops.py +269 -0
mlx_vlm/models/moondream2/language.py +267 -0
mlx_vlm/models/moondream2/moondream2.py +522 -0
mlx_vlm/models/moondream2/processing_moondream.py +144 -0
mlx_vlm/models/moondream2/vision.py +200 -0
mlx_vlm/models/multi_modality/__init__.py +4 -0
mlx_vlm/models/multi_modality/config.py +108 -0
mlx_vlm/models/multi_modality/language.py +191 -0
mlx_vlm/models/multi_modality/multi_modality.py +338 -0
mlx_vlm/models/multi_modality/sam.py +543 -0
mlx_vlm/models/multi_modality/vision.py +450 -0
mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
mlx_vlm/models/paddleocr_vl/config.py +93 -0
mlx_vlm/models/paddleocr_vl/language.py +522 -0
mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
mlx_vlm/models/paddleocr_vl/vision.py +358 -0
mlx_vlm/models/paligemma/__init__.py +4 -0
mlx_vlm/models/paligemma/config.py +50 -0
mlx_vlm/models/paligemma/language.py +253 -0
mlx_vlm/models/paligemma/paligemma.py +140 -0
mlx_vlm/models/paligemma/vision.py +218 -0
mlx_vlm/models/phi3_v/__init__.py +5 -0
mlx_vlm/models/phi3_v/config.py +55 -0
mlx_vlm/models/phi3_v/language.py +2 -0
mlx_vlm/models/phi3_v/phi3_v.py +239 -0
mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
mlx_vlm/models/phi3_v/vision.py +294 -0
mlx_vlm/models/pixtral/__init__.py +4 -0
mlx_vlm/models/pixtral/config.py +69 -0
mlx_vlm/models/pixtral/language.py +195 -0
mlx_vlm/models/pixtral/pixtral.py +208 -0
mlx_vlm/models/pixtral/vision.py +293 -0
mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
mlx_vlm/models/qwen2_5_vl/config.py +90 -0
mlx_vlm/models/qwen2_5_vl/language.py +541 -0
mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
mlx_vlm/models/qwen2_vl/__init__.py +2 -0
mlx_vlm/models/qwen2_vl/config.py +86 -0
mlx_vlm/models/qwen2_vl/language.py +539 -0
mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
mlx_vlm/models/qwen2_vl/vision.py +308 -0
mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
mlx_vlm/models/qwen3_vl/__init__.py +2 -0
mlx_vlm/models/qwen3_vl/config.py +103 -0
mlx_vlm/models/qwen3_vl/language.py +596 -0
mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
mlx_vlm/models/qwen3_vl/vision.py +441 -0
mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
mlx_vlm/models/smolvlm/__init__.py +4 -0
mlx_vlm/models/smolvlm/config.py +59 -0
mlx_vlm/models/smolvlm/smolvlm.py +60 -0
mlx_vlm/prompt_utils.py +565 -0
mlx_vlm/sample_utils.py +39 -0
mlx_vlm/server.py +1107 -0
mlx_vlm/smolvlm_video_generate.py +109 -0
mlx_vlm/tokenizer_utils.py +371 -0
mlx_vlm/trainer/__init__.py +9 -0
mlx_vlm/trainer/lora.py +70 -0
mlx_vlm/trainer/trainer.py +299 -0
mlx_vlm/trainer/utils.py +160 -0
mlx_vlm/utils.py +1339 -0
mlx_vlm/version.py +1 -0
mlx_vlm/video_generate.py +611 -0

mlx_vlm/models/multi_modality/vision.py ADDED Viewed

@@ -0,0 +1,450 @@
+import copy
+from functools import partial
+from math import sqrt
+from typing import Dict, Optional, Union
+import cv2
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+from .config import MLPConfig, VisionConfig
+from .sam import SAMEncoder
+def check_array_shape(arr):
+    shape = arr.shape
+    # Check if the shape has 4 dimensions
+    if len(shape) != 4:
+        return False
+    out_channels, kH, KW, _ = shape
+    # Check if out_channels is the largest, and kH and KW are the same
+    if (out_channels >= kH) and (out_channels >= KW) and (kH == KW):
+        return True
+    else:
+        return False
+class AttentionPoolLatent(nn.Module):
+    """Attention pooling w/ latent query"""
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int = None,
+        embed_dim: int = None,
+        num_heads: int = 8,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        latent_len: int = 1,
+        latent_dim: int = None,
+        pos_embed: str = "",
+        pool_type: str = "token",
+        norm_layer: Optional[nn.Module] = None,
+        drop: float = 0.0,
+    ):
+        super().__init__()
+        embed_dim = embed_dim or in_features
+        out_features = out_features or in_features
+        assert embed_dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.scale = self.head_dim**-0.5
+        self.pool = pool_type
+        self.latent_dim = latent_dim or embed_dim
+        self.latent_len = latent_len
+        self.latent = mx.zeros((self.latent_len, embed_dim))[None, :]
+        self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+        self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias)
+        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.proj = nn.Linear(embed_dim, embed_dim)
+        self.proj_drop = nn.Dropout(drop)
+        if pos_embed == "abs":
+            spatial_len = self.feat_size
+            self.pos_embed = mx.zeros((spatial_len, in_features))
+        else:
+            self.pos_embed = None
+        self.norm = nn.LayerNorm(out_features)
+        config = MLPConfig(
+            hidden_size=embed_dim, intermediate_size=int(embed_dim * mlp_ratio)
+        )
+        self.mlp = MLP(config)
+    def __call__(self, x: mx.array):
+        B, N, C = x.shape
+        if self.pos_embed is not None:
+            x = x + self.pos_embed.unsqueeze(0).to(x.dtype)
+        q_latent = mx.array(self.latent)
+        q = (
+            self.q(q_latent)
+            .reshape(B, self.latent_len, self.num_heads, self.head_dim)
+            .transpose(0, 2, 1, 3)
+        )
+        kv = (
+            self.kv(x)
+            .reshape(B, N, 2, self.num_heads, self.head_dim)
+            .transpose(2, 0, 3, 1, 4)
+        )
+        k, v = mx.split(kv, 2, axis=0)
+        q, k = self.q_norm(q), self.k_norm(k)
+        x = mx.fast.scaled_dot_product_attention(
+            q, k[0], v[0], scale=(1.0 / sqrt(q.shape[-1])), mask=None
+        )
+        x = x.transpose(0, 2, 1, 3).reshape(B, self.latent_len, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        x = x + self.mlp(self.norm(x))
+        # optional pool if latent seq_len > 1 and pooled output is desired
+        if self.pool == "token":
+            x = x[:, 0]
+        elif self.pool == "avg":
+            x = x.mean(1)
+        return x
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dims: int,
+        num_heads: int,
+        qkv_bias: bool = False,
+    ):
+        super().__init__()
+        if (dims % num_heads) != 0:
+            raise ValueError(
+                "The input feature dimensions should be divisible by the "
+                f"number of heads ({dims} % {num_heads}) != 0"
+            )
+        self.num_heads = num_heads = num_heads
+        head_dim = dims // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dims, dims * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dims, dims, bias=True)
+    def __call__(self, x, mask=None):
+        qkv = self.qkv(x)
+        queries, keys, values = mx.split(qkv, 3, axis=-1)
+        num_heads = self.num_heads
+        B, L, D = queries.shape
+        _, S, _ = keys.shape
+        queries = queries.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3)
+        keys = keys.reshape(B, S, num_heads, -1).transpose(0, 2, 1, 3)
+        values = values.reshape(B, S, num_heads, -1).transpose(0, 2, 1, 3)
+        output = mx.fast.scaled_dot_product_attention(
+            queries, keys, values, scale=self.scale, mask=mask
+        )
+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.proj(output)
+class FastGELUActivation(nn.Module):
+    """
+    Applies a tanh-based GELU approximation that is slower than QuickGELU but more accurate.
+    Computes 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3))).
+    See: https://github.com/hendrycks/GELUs
+    """
+    def __call__(self, input: mx.array) -> mx.array:
+        # The result is cast back to the input's dtype before returning, so the
+        # activation hands the next layer the same dtype it received.
+        return (
+            0.5
+            * input
+            * (1.0 + mx.tanh(np.sqrt(2 / np.pi) * (input + 0.044715 * (input**3))))
+        ).astype(input.dtype)
+class MLP(nn.Module):
+    def __init__(self, config: Union[VisionConfig, Dict], bias: bool = True):
+        super().__init__()
+        self.activation_fn = FastGELUActivation()
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=bias)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=bias)
+    def __call__(self, x: mx.array) -> mx.array:
+        x = self.activation_fn(self.fc1(x))
+        x = self.fc2(x)
+        return x
+class EncoderLayer(nn.Module):
+    def __init__(self, config: VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.attn = Attention(
+            config.hidden_size, config.num_attention_heads, qkv_bias=True
+        )
+        self.norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = MLP(config)
+        self.norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+    def __call__(self, x: mx.array, mask: Optional[mx.array] = None) -> mx.array:
+        y = self.norm1(x)
+        y = self.attn(y, mask)
+        x = x + y
+        y = self.norm2(x)
+        y = self.mlp(y)
+        return x + y
+class VisionEmbeddings(nn.Module):
+    def __init__(self, config: VisionConfig, norm_layer: bool = False):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.proj = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.norm = (
+            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+            if norm_layer
+            else nn.Identity()
+        )
+    def __call__(self, x: mx.array) -> mx.array:
+        patch_embeddings = self.proj(x)
+        patch_embeddings = mx.flatten(patch_embeddings, start_axis=1, end_axis=2)
+        return self.norm(patch_embeddings)
+class SigLipVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: VisionConfig,
+        ignore_head: bool,
+        pre_norm: bool = False,
+        no_embed_class: bool = True,
+    ):
+        super().__init__()
+        self.num_prefix_tokens = 1
+        self.no_embed_class = False
+        self.dynamic_img_size = False
+        self.ignore_head = ignore_head
+        self.cls_token = None
+        self.reg_token = None
+        self.patch_embed = VisionEmbeddings(config)
+        self.norm_pre = nn.LayerNorm(config.hidden_size) if pre_norm else nn.Identity()
+        self.blocks = [EncoderLayer(config) for _ in range(config.num_hidden_layers)]
+        self.norm = nn.LayerNorm(config.hidden_size)
+        num_patches = self.patch_embed.num_patches
+        embed_len = (
+            num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        )
+        self.pos_embed = mx.random.normal((embed_len, config.hidden_size))[None, :]
+        norm_layer = partial(nn.LayerNorm, eps=1e-5)
+        self.attn_pool = AttentionPoolLatent(
+            config.hidden_size,
+            num_heads=config.num_attention_heads,
+            norm_layer=norm_layer,
+        )
+    def __call__(
+        self,
+        x: mx.array,
+        output_hidden_states: Optional[bool] = None,
+    ) -> mx.array:
+        x = self.patch_embed(x)
+        x += self.pos_embed
+        x = self.norm_pre(x)
+        encoder_states = (x,) if output_hidden_states else None
+        for l in self.blocks:
+            x = l(x, mask=None)
+            if output_hidden_states:
+                encoder_states = encoder_states + (x,)
+        pooler_output = self.norm(x)
+        if not self.ignore_head:
+            pooler_output = self.attn_pool(pooler_output)
+        return pooler_output, x, encoder_states
+class HybridVisionModel(nn.Module):
+    def __init__(self, config: VisionConfig, resolution: str, ignore_head: bool = True):
+        super().__init__()
+        self.model_type = config.model_type
+        self.resolution = resolution
+        if self.model_type != "vision":
+            raise ValueError(f"Unsupported model type: {self.model_type}")
+        if resolution == "high":
+            self.vision_tower = SAMEncoder()
+        else:
+            self.vision_tower = SigLipVisionModel(config, ignore_head)
+    def __call__(self, x: mx.array) -> mx.array:
+        if self.resolution == "high":
+            return self.vision_tower(x)
+        else:
+            return self.vision_tower(x)[0]
+def resize_image(image, size, antialias=True):
+    """
+    Resize an image with OpenCV.
+    Args:
+        image (numpy.ndarray): The input image array.  Supports H × W or H × W × C.
+                               If you pass in a batch (N × H × W × C) just slice the
+                               element you want, e.g. image[0].
+        size  (tuple): Target size as (width, height) — exactly the same order that
+                       cv2.resize expects.
+        antialias (bool):
+            * True  → high‑quality interpolation (bicubic for upscaling, area for downscaling)
+            * False → nearest‑neighbor (fast, blocky)
+    Returns:
+        numpy.ndarray: The resized image array.
+    """
+    img = np.ascontiguousarray(np.asarray(image))
+    if img.ndim == 4 and img.shape[0] == 1:  # squeeze stray batch dim
+        img = img[0]
+    h0, w0 = img.shape[:2]
+    # --- work out dsize vs fx/fy ---------------------------------------------
+    dsize = None
+    fx = fy = 0.0
+    if isinstance(size, (int, float)):  # uniform scale
+        fx = fy = float(size)
+    elif isinstance(size, (tuple, list)) and len(size) == 2:
+        a, b = size
+        # Heuristic: treat "small" floats as scale factors
+        if all(isinstance(x, (int, float)) and x < 10 for x in (a, b)):
+            fx, fy = float(a), float(b)  # scale factors
+        else:
+            dsize = (int(a), int(b))  # absolute pixels
+    else:
+        raise ValueError("target must be scalar or a 2‑tuple")
+    # Guard against zeros after int‑casting
+    if dsize:
+        if dsize[0] <= 0 or dsize[1] <= 0:
+            raise ValueError(f"dsize became {dsize}")
+    else:
+        if fx <= 0 or fy <= 0:
+            raise ValueError(f"fx,fy became {(fx, fy)}")
+    # --- choose interpolation -------------------------------------------------
+    if antialias:
+        # Use Lanczos interpolation for potentially better detail preservation
+        interp = cv2.INTER_LANCZOS4
+    else:
+        interp = cv2.INTER_NEAREST
+    # --- call OpenCV ----------------------------------------------------------
+    return mx.array(cv2.resize(img, dsize=dsize, fx=fx, fy=fy, interpolation=interp))
+class VisionModel(nn.Module):
+    def __init__(self, config: VisionConfig, ignore_head: bool = True):
+        super().__init__()
+        self.model_type = config.model_type
+        self.config = config
+        if self.model_type != "vision":
+            raise ValueError(f"Unsupported model type: {self.model_type}")
+        if config.cls == "HybridVisionTower":
+            self.high_layer_norm = nn.LayerNorm(
+                config.params["high_res_cfg"]["output_dim"]
+            )
+            self.low_layer_norm = nn.LayerNorm(
+                config.params["low_res_cfg"]["output_dim"]
+            )
+            high_res_cfg = copy.deepcopy(config)
+            high_res_cfg.image_size = config.params["high_res_cfg"]["image_size"]
+            self.vision_tower_high = HybridVisionModel(
+                high_res_cfg, "high", ignore_head
+            )
+            low_res_cfg = copy.deepcopy(config)
+            low_res_cfg.image_size = config.params["low_res_cfg"]["image_size"]
+            self.vision_tower_low = HybridVisionModel(low_res_cfg, "low", ignore_head)
+            self.low_res_size = config.params["low_res_cfg"]["image_size"]
+            self.resize = lambda image: resize_image(
+                image, (self.low_res_size, self.low_res_size), antialias=True
+            )
+        else:
+            self.vision_tower = SigLipVisionModel(config, ignore_head)
+    def __call__(
+        self, x: mx.array, output_hidden_states: Optional[bool] = None
+    ) -> mx.array:
+        if self.config.cls == "HybridVisionTower":
+            high_images = x
+            low_images = mx.array(self.resize(np.array(x)))[None, :]
+            high_res = self.vision_tower_high(high_images)
+            low_res = self.vision_tower_low(low_images)
+            return (high_res, low_res)
+        else:
+            return self.vision_tower(x, output_hidden_states)
+    def sanitize(self, weights):
+        sanitized_weights = {}
+        weight_keys = {
+            "neck.0.weight",
+            "neck.2.weight",
+            "neck_hd.0.weight",
+            "neck_hd.2.weight",
+            "downsamples.0.weight",
+            "downsamples.1.weight",
+            "patch_embed.proj.weight",
+        }
+        for k, v in weights.items():
+            if "position_ids" in k:
+                # Remove unused position_ids
+                continue
+            elif ".".join(k.split(".")[-3:]) in weight_keys:
+                # PyTorch conv2d weight tensors have shape:
+                #   [out_channels, in_channels, kH, KW]
+                # MLX conv2d expects the weight be of shape:
+                #   [out_channels, kH, KW, in_channels]
+                if check_array_shape(v):
+                    sanitized_weights[k] = v
+                else:
+                    sanitized_weights[k] = v.transpose(0, 2, 3, 1)
+            else:
+                sanitized_weights[k] = v
+        return sanitized_weights

mlx_vlm/models/paddleocr_vl/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .config import ModelConfig, TextConfig, VisionConfig
+from .paddleocr_vl import LanguageModel, Model, VisionModel
+from .processing_paddleocr_vl import PaddleOCRVLProcessor

mlx_vlm/models/paddleocr_vl/config.py ADDED Viewed

@@ -0,0 +1,93 @@
+import inspect
+from dataclasses import dataclass, field
+from typing import Dict, Optional, Union
+from ..base import BaseModelConfig
+@dataclass
+class VisionConfig(BaseModelConfig):
+    """Vision-side configuration for the ``paddleocr_vl`` model type, with defaults."""
+    model_type: str = "paddleocr_vl"
+    # Encoder dimensions.
+    hidden_size: int = 1152
+    intermediate_size: int = 4304
+    num_hidden_layers: int = 27
+    num_attention_heads: int = 16
+    # Input image geometry.
+    num_channels: int = 3
+    image_size: int = 384
+    patch_size: int = 14
+    hidden_act: str = "gelu_pytorch_tanh"
+    layer_norm_eps: float = 1e-6
+    attention_dropout: float = 0.0
+    # NOTE(review): presumably the side length of the patch group merged into
+    # one token - confirm against the vision module.
+    spatial_merge_size: int = 2
+@dataclass
+class TextConfig(BaseModelConfig):
+    model_type: str = "paddleocr_vl"
+    hidden_size: int = 1024
+    num_hidden_layers: int = 18
+    intermediate_size: int = 3072
+    num_attention_heads: int = 16
+    rms_norm_eps: float = 1e-05
+    vocab_size: int = 103424
+    num_key_value_heads: Optional[int] = 2
+    max_position_embeddings: Optional[int] = 131072
+    rope_theta: float = 500000.0
+    rope_traditional: bool = False
+    rope_scaling: Optional[Dict[str, Union[float, str]]] = None
+    use_cache: bool = True
+    hidden_act: str = ("silu",)
+    pad_token_id: int = (0,)
+    bos_token_id: int = (1,)
+    eos_token_id: int = (2,)
+    use_bias: bool = (False,)
+    head_dim: int = (128,)
+    rope_parameters: Dict = None
+    rope_scaling: Dict = field(
+        default_factory=lambda: {
+            "rope_type": "default",
+            "type": "default",
+            "mrope_section": [16, 24, 24],
+        }
+    )
+    def __post_init__(self):
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+        if self.rope_scaling:
+            required_keys = {"mrope_section", "type"}
+            if not all(key in self.rope_scaling for key in required_keys):
+                raise ValueError(f"rope_scaling must contain keys {required_keys}")
+            if not self.rope_scaling["type"] in ["mrope", "default"]:
+                raise ValueError(f"rope_scaling type must be 'mrope' or 'default'")
+@dataclass
+class ModelConfig(BaseModelConfig):
+    text_config: TextConfig
+    vision_config: VisionConfig
+    model_type: str = "paddleocr_vl"
+    ignore_index: int = -100
+    image_token_id: int = 100295
+    video_token_id: int = 100296
+    vision_start_token_id: int = 101305
+    vision_end_token_id: int = (101306,)
+    eos_token_id: int = (2,)
+    @classmethod
+    def from_dict(cls, params):
+        # Copy text config parameters from root level
+        excluded_keys = {"vision_config"}
+        params["text_config"] = dict(
+            filter(lambda x: x[0] not in excluded_keys, params.items())
+        )
+        return cls(
+            **{
+                k: v
+                for k, v in params.items()
+                if k in inspect.signature(cls).parameters
+            }
+        )