fount-vlm-nell-02 0.3.11__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/models/jina_vlm/vision.py
@@ -0,0 +1,202 @@
+ """Vision encoder for Jina VLM in MLX."""
+
+ from typing import List, Tuple
+
+ import mlx.core as mx
+ import mlx.nn as nn
+
+ from .config import VisionConfig
+
+
+ class PatchEmbedding(nn.Module):
+     """Patch embedding using linear projection."""
+
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.patch_size = config.patch_size
+         self.num_channels = config.num_channels
+         self.hidden_size = config.hidden_size
+
+         # Linear projection for patches - named to match weights
+         patch_dim = config.num_channels * config.patch_size * config.patch_size
+         self.proj = nn.Linear(patch_dim, config.hidden_size, bias=config.use_bias)
+
+     def __call__(self, x: mx.array) -> Tuple[mx.array, Tuple[int, int]]:
+         if x.ndim == 3:
+             # Already patchified: (B, n_patches, patch_dim)
+             B, n_patches, _ = x.shape
+             nH = nW = int(n_patches**0.5)
+             x = self.proj(x)
+         else:
+             # Image format: (B, C, H, W)
+             B, C, H, W = x.shape
+             pH, pW = self.patch_size, self.patch_size
+             nH, nW = H // pH, W // pW
+             x = x.reshape(B, C, nH, pH, nW, pW)
+             x = x.transpose(0, 2, 4, 1, 3, 5)
+             x = x.reshape(B, nH * nW, C * pH * pW)
+             x = self.proj(x)
+         return x, (nH, nW)
+
+
+ class VisionMLP(nn.Module):
+     """MLP for vision transformer - matches weight naming: ffn.up, ffn.down"""
+
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         # Named to match weights: ffn.up, ffn.down
+         self.up = nn.Linear(
+             config.hidden_size, config.intermediate_size, bias=config.use_bias
+         )
+         self.down = nn.Linear(
+             config.intermediate_size, config.hidden_size, bias=config.use_bias
+         )
+         # Use built-in GELU with tanh approximation
+         if config.activation == "gelu_pytorch_tanh":
+             self.gelu = nn.GELU(approx="tanh")
+         else:
+             self.gelu = nn.GELU()
+
+     def __call__(self, x: mx.array) -> mx.array:
+         x = self.up(x)
+         x = self.gelu(x)
+         x = self.down(x)
+         return x
+
+
+ class VisionAttention(nn.Module):
+     """Multi-head self-attention - matches weight naming: attn.qkv, attn.out"""
+
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.num_heads = config.num_attention_heads
+         self.head_dim = config.head_dim
+         self.scale = self.head_dim**-0.5
+
+         # Fused QKV projection - named to match weights
+         self.qkv = nn.Linear(
+             config.hidden_size,
+             3 * config.num_attention_heads * config.head_dim,
+             bias=config.use_bias,
+         )
+         self.out = nn.Linear(
+             config.num_attention_heads * config.head_dim,
+             config.hidden_size,
+             bias=config.use_bias,
+         )
+
+     def __call__(self, x: mx.array) -> mx.array:
+         B, L, _ = x.shape
+         qkv = self.qkv(x)
+         qkv = qkv.reshape(B, L, 3, self.num_heads, self.head_dim)
+         qkv = qkv.transpose(2, 0, 3, 1, 4)
+         q, k, v = qkv[0], qkv[1], qkv[2]
+
+         attn = (q @ k.transpose(0, 1, 3, 2)) * self.scale
+         attn = mx.softmax(attn, axis=-1)
+         x = attn @ v
+
+         x = x.transpose(0, 2, 1, 3).reshape(B, L, -1)
+         x = self.out(x)
+         return x
+
+
+ class VisionEncoderLayer(nn.Module):
+     """Transformer block - matches weight naming: attn_norm, ffn_norm"""
+
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         # Named to match weights: attn_norm, ffn_norm
+         self.attn_norm = nn.LayerNorm(
+             config.hidden_size, eps=config.layer_norm_eps, bias=config.use_bias
+         )
+         self.attn = VisionAttention(config)
+         self.ffn_norm = nn.LayerNorm(
+             config.hidden_size, eps=config.layer_norm_eps, bias=config.use_bias
+         )
+         self.ffn = VisionMLP(config)
+
+     def __call__(self, x: mx.array) -> mx.array:
+         x = x + self.attn(self.attn_norm(x))
+         x = x + self.ffn(self.ffn_norm(x))
+         return x
+
+
+ class VisionModel(nn.Module):
+     """Vision encoder (SigLIP-style ViT)."""
+
+     def __init__(self, config: VisionConfig):
+         super().__init__()
+         self.config = config
+         self.model_type = config.model_type
+         self.hidden_size = config.hidden_size
+         self.vit_layers = config.vit_layers
+
+         # Named to match weights: patch_embed.proj
+         self.patch_embed = PatchEmbedding(config)
+
+         # Named to match weights: pos_embed (saved as 2D, not 3D)
+         num_patches = (config.image_size // config.patch_size) ** 2
+         if config.use_cls_token:
+             num_patches += 1
+             self.cls_token = mx.zeros((1, 1, config.hidden_size))
+         else:
+             self.cls_token = None
+         self.pos_embed = mx.zeros((num_patches, config.hidden_size))
+
+         # Transformer blocks
+         self.layers = [
+             VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)
+         ]
+
+         # Named to match weights: post_norm
+         if config.post_layer_norm:
+             self.post_norm = nn.LayerNorm(
+                 config.hidden_size, eps=config.layer_norm_eps, bias=config.use_bias
+             )
+         else:
+             self.post_norm = None
+
+     def __call__(self, x: mx.array) -> Tuple[mx.array, List[mx.array]]:
+         x, shape = self.patch_embed(x)
+
+         if self.cls_token is not None:
+             B = x.shape[0]
+             cls = mx.broadcast_to(self.cls_token, (B, 1, self.hidden_size))
+             x = mx.concatenate([cls, x], axis=1)
+
+         # pos_embed is (num_patches, hidden_size), add batch dim for broadcast
+         x = x + self.pos_embed[None, :, :]
+
+         hidden_states = []
+         for layer in self.layers:
+             x = layer(x)
+             hidden_states.append(x)
+
+         if self.post_norm is not None:
+             x = self.post_norm(x)
+             hidden_states.append(x)
+
+         return x, hidden_states
+
+     def get_features(self, images: mx.array) -> mx.array:
+         """Extract features from specific ViT layers.
+
+         Note: hidden_states includes all layer outputs plus the post_norm output.
+         vit_layers indices (e.g., [-4, -10]) are applied to this full list.
+         For 27 layers with post_norm, hidden_states has 28 elements:
+         - indices 0-26: layer 0-26 outputs
+         - index 27: post_norm output
+         So vit_layers=[-4, -10] extracts layers 24 and 18 (not 23 and 17).
+         """
+         _, hidden_states = self(images)
+         # Use full hidden_states including post_norm output for correct indexing
+
+         features = []
+         for layer_idx in self.vit_layers:
+             feats = hidden_states[layer_idx]
+             if self.cls_token is not None:
+                 feats = feats[:, 1:]
+             features.append(feats)
+
+         return mx.concatenate(features, axis=-1)
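
The get_features docstring above hinges on one indexing detail: hidden_states collects every encoder layer's output plus, when post_norm is enabled, one extra entry for the post_norm output, and the negative vit_layers indices are resolved against that extended list. A small standalone sketch of the arithmetic (the 27-layer count is the docstring's own example, not a value read from config.py):

    num_hidden_layers = 27  # example figure from the docstring above
    post_layer_norm = True

    # hidden_states is built as one entry per layer, plus the post_norm output.
    num_entries = num_hidden_layers + (1 if post_layer_norm else 0)  # 28

    for idx in (-4, -10):
        resolved = num_entries + idx  # Python negative indexing
        # Entries 0..26 are layer outputs; entry 27 is the post_norm output.
        print(f"vit_layers index {idx} -> layer output {resolved}")
    # Prints 24 and 18, matching the docstring.

Since get_features concatenates the selected feature maps along the last axis, callers see hidden_size * len(vit_layers) features per patch.
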
mlx_vlm/models/kernels.py
@@ -0,0 +1,447 @@
+ import mlx.core as mx
+
+
+ def nearest_interpolate(x, size=None, scale_factor=None):
+     """
+     Nearest neighbor interpolation that exactly matches PyTorch's behavior.
+     """
+     # Get input dimensions
+     batch_size, channels, in_h, in_w = x.shape
+
+     # Calculate output dimensions
+     if size is not None:
+         out_h, out_w = size
+     elif scale_factor is not None:
+         if isinstance(scale_factor, (int, float)):
+             scale_h = scale_w = scale_factor
+         else:
+             scale_h, scale_w = scale_factor
+         out_h, out_w = int(in_h * scale_h), int(in_w * scale_w)
+     else:
+         raise ValueError("Either size or scale_factor must be specified")
+
+     # Create dimensions tensor
+     dims = mx.array([batch_size, channels, in_h, in_w, out_h, out_w], dtype=mx.int32)
+
+     # Reshape input tensor to 1D for kernel processing
+     x_flat = x.reshape(-1)
+     input_dtype = x.dtype
+     if input_dtype != mx.float32:
+         x_flat = x_flat.astype(mx.float32)
+
+     # Metal kernel source that matches PyTorch's coordinate calculation
+     source = """
+         uint x_out = thread_position_in_grid.x;
+         uint y_out = thread_position_in_grid.y;
+         uint bc_idx = thread_position_in_grid.z;
+
+         int batch_size = dims[0];
+         int channels = dims[1];
+         int in_h = dims[2];
+         int in_w = dims[3];
+         int out_h = dims[4];
+         int out_w = dims[5];
+
+         if (x_out >= (uint)out_w || y_out >= (uint)out_h || bc_idx >= (uint)(batch_size * channels))
+             return;
+
+         int c = bc_idx % channels;
+         int b = bc_idx / channels;
+
+         // PyTorch's coordinate calculation for nearest neighbor
+         // This matches: torch.nn.functional.interpolate(..., mode='nearest')
+         float scale_h = float(in_h) / float(out_h);
+         float scale_w = float(in_w) / float(out_w);
+
+         // PyTorch uses floor for nearest neighbor coordinate mapping
+         int y_in = int(floor(float(y_out) * scale_h));
+         int x_in = int(floor(float(x_out) * scale_w));
+
+         // Clamp to bounds
+         y_in = max(0, min(y_in, in_h - 1));
+         x_in = max(0, min(x_in, in_w - 1));
+
+         int input_offset = ((b * channels + c) * in_h + y_in) * in_w + x_in;
+         int output_offset = ((b * channels + c) * out_h + y_out) * out_w + x_out;
+
+         output[output_offset] = input[input_offset];
+     """
+
+     # Create and run kernel
+     kernel = mx.fast.metal_kernel(
+         name="nearest_interpolation",
+         input_names=["input", "dims"],
+         output_names=["output"],
+         source=source,
+     )
+
+     threadgroup = get_optimal_threadgroup(out_w, out_h)
+     outputs = kernel(
+         inputs=[x_flat, dims],
+         grid=(out_w, out_h, batch_size * channels),
+         threadgroup=threadgroup,
+         output_shapes=[(batch_size * channels * out_h * out_w,)],
+         output_dtypes=[mx.float32],
+     )
+
+     result = outputs[0].reshape(batch_size, channels, out_h, out_w)
+     if input_dtype != mx.float32:
+         result = result.astype(input_dtype)
+
+     return result
+
+
+ def bicubic_interpolate(
+     x, size=None, scale_factor=None, align_corners=False, antialias=False
+ ):
+     """
+     Bicubic interpolation implemented as a custom Metal kernel, following
+     PyTorch's torch.nn.functional.interpolate coordinate conventions.
+
+     Args:
+         x: MLX tensor of shape [B, C, H, W]
+         size: Tuple of (out_h, out_w) or None
+         scale_factor: Float or tuple of (scale_h, scale_w) or None
+         align_corners: Whether to align corners
+         antialias: Whether to apply antialiasing
+
+     Returns:
+         Interpolated MLX tensor
+     """
+     # Get input dimensions
+     batch_size, channels, in_h, in_w = x.shape
+
+     # Calculate output dimensions
+     if size is not None:
+         out_h, out_w = size
+         scale_h, scale_w = out_h / in_h, out_w / in_w
+     elif scale_factor is not None:
+         if isinstance(scale_factor, (int, float)):
+             scale_h = scale_w = scale_factor
+         else:
+             scale_h, scale_w = scale_factor
+         out_h, out_w = int(in_h * scale_h), int(in_w * scale_w)
+     else:
+         raise ValueError("Either size or scale_factor must be specified")
+
+     # Calculate antialiasing parameters
+     # PyTorch uses support = 2.0 for bicubic when antialiasing
+     support = 2.0
+     antialias_flag = 1.0 if (antialias and (scale_h < 1.0 or scale_w < 1.0)) else 0.0
+
+     # When downsampling with antialias, PyTorch expands the filter support
+     if antialias and scale_h < 1.0:
+         filter_scale_h = 1.0 / scale_h
+     else:
+         filter_scale_h = 1.0
+
+     if antialias and scale_w < 1.0:
+         filter_scale_w = 1.0 / scale_w
+     else:
+         filter_scale_w = 1.0
+
+     # Create parameters tensor
+     params = mx.array(
+         [
+             scale_h,
+             scale_w,
+             1.0 if align_corners else 0.0,
+             antialias_flag,
+             filter_scale_h,
+             filter_scale_w,
+             support,
+         ],
+         dtype=mx.float32,
+     )
+
+     # Create dimensions tensor
+     dims = mx.array([batch_size, channels, in_h, in_w, out_h, out_w], dtype=mx.int32)
+
+     # Reshape input tensor to 1D for kernel processing
+     x_flat = x.reshape(-1)
+
+     # Convert to float32 for processing if needed
+     input_dtype = x.dtype
+     if input_dtype != mx.float32:
+         x_flat = x_flat.astype(mx.float32)
+
+     header = """
+         // Bicubic kernel function
+         float cubic_kernel(float x) {
+             float absx = fabs(x);
+             float absx2 = absx * absx;
+             float absx3 = absx2 * absx;
+
+             const float a = -0.5f;
+
+             if (absx <= 1.0f) {
+                 return (a + 2.0f) * absx3 - (a + 3.0f) * absx2 + 1.0f;
+             } else if (absx < 2.0f) {
+                 return a * absx3 - 5.0f * a * absx2 + 8.0f * a * absx - 4.0f * a;
+             }
+             return 0.0f;
+         }
+
+         // Antialiased bicubic kernel - scales the support region for downsampling
+         float cubic_kernel_antialias(float x, float scale) {
+             // When downsampling, we need to integrate over a wider region
+             // This matches PyTorch's antialiasing behavior
+             return cubic_kernel(x / scale);
+         }
+     """
+
+     # Metal kernel source code with antialiasing support
+     source = """
+         // Get thread position
+         uint x_out = thread_position_in_grid.x;
+         uint y_out = thread_position_in_grid.y;
+         uint bc_idx = thread_position_in_grid.z;
+
+         // Extract dimensions
+         int batch_size = dims[0];
+         int channels = dims[1];
+         int in_h = dims[2];
+         int in_w = dims[3];
+         int out_h = dims[4];
+         int out_w = dims[5];
+
+         // Extract parameters
+         float scale_h = params[0];
+         float scale_w = params[1];
+         bool align_corners = params[2] > 0.5f;
+         bool use_antialias = params[3] > 0.5f;
+         float filter_scale_h = params[4];
+         float filter_scale_w = params[5];
+         float support = params[6];
+
+         // Check bounds
+         if (x_out >= (uint)out_w || y_out >= (uint)out_h || bc_idx >= (uint)(batch_size * channels))
+             return;
+
+         // Calculate batch and channel indices
+         int c = bc_idx % channels;
+         int b = bc_idx / channels;
+
+         // Calculate input coordinates
+         float x_in, y_in;
+
+         if (align_corners && out_w > 1 && out_h > 1) {
+             x_in = float(x_out) * (in_w - 1) / (out_w - 1);
+             y_in = float(y_out) * (in_h - 1) / (out_h - 1);
+         } else {
+             // PyTorch's default coordinate mapping
+             x_in = ((float(x_out) + 0.5f) / float(out_w)) * float(in_w) - 0.5f;
+             y_in = ((float(y_out) + 0.5f) / float(out_h)) * float(in_h) - 0.5f;
+         }
+
+         // Calculate the support region based on antialiasing
+         float support_h = use_antialias ? support * filter_scale_h : support;
+         float support_w = use_antialias ? support * filter_scale_w : support;
+
+         // Calculate the range of input pixels to sample
+         int y_start = int(floor(y_in - support_h)) + 1;
+         int y_end = int(floor(y_in + support_h)) + 1;
+         int x_start = int(floor(x_in - support_w)) + 1;
+         int x_end = int(floor(x_in + support_w)) + 1;
+
+         // Clamp to valid range
+         y_start = max(0, y_start);
+         y_end = min(in_h, y_end);
+         x_start = max(0, x_start);
+         x_end = min(in_w, x_end);
+
+         // Perform bicubic interpolation with antialiasing
+         float result = 0.0f;
+         float weight_sum = 0.0f;
+
+         for (int y_pos = y_start; y_pos < y_end; y_pos++) {
+             float dy = float(y_pos) - y_in;
+             float wy = use_antialias ?
+                 cubic_kernel_antialias(dy, filter_scale_h) :
+                 cubic_kernel(dy);
+
+             for (int x_pos = x_start; x_pos < x_end; x_pos++) {
+                 float dx = float(x_pos) - x_in;
+                 float wx = use_antialias ?
+                     cubic_kernel_antialias(dx, filter_scale_w) :
+                     cubic_kernel(dx);
+
+                 float weight = wy * wx;
+
+                 // Calculate input tensor offset
+                 int input_offset = ((b * channels + c) * in_h + y_pos) * in_w + x_pos;
+
+                 // Add weighted contribution
+                 result += input[input_offset] * weight;
+                 weight_sum += weight;
+             }
+         }
+
+         // Normalize by weight sum
+         if (weight_sum > 1e-8f) {
+             result /= weight_sum;
+         }
+
+         // Calculate output tensor offset
+         int output_offset = ((b * channels + c) * out_h + y_out) * out_w + x_out;
+
+         // Assign the result to output
+         output[output_offset] = result;
+     """
+
+     # Create the kernel
+     kernel = mx.fast.metal_kernel(
+         name="bicubic_interpolation_antialias",
+         input_names=["input", "dims", "params"],
+         output_names=["output"],
+         source=source,
+         header=header,
+     )
+
+     # Run the kernel
+     threadgroup = get_optimal_threadgroup(out_w, out_h)
+     outputs = kernel(
+         inputs=[x_flat, dims, params],
+         grid=(out_w, out_h, batch_size * channels),
+         threadgroup=threadgroup,
+         output_shapes=[(batch_size * channels * out_h * out_w,)],
+         output_dtypes=[mx.float32],
+     )
+
+     # Reshape output back to 4D tensor and convert back to original dtype
+     result = outputs[0].reshape(batch_size, channels, out_h, out_w)
+     if input_dtype != mx.float32:
+         result = result.astype(input_dtype)
+
+     return result
+
+
+ def grid_sample(x, grid):
+     """
+     Bilinear grid sampling implemented as a custom Metal kernel.
+
+     Args:
+         x: MLX tensor of shape [B, H, W, C] (channels-last, as indexed by the kernel)
+         grid: MLX tensor of shape [B, gN, gM, 2] with normalized coordinates in [-1, 1]
+
+     Returns:
+         Sampled MLX tensor of shape [B, gN, gM, C]
+     """
+
+     assert x.ndim == 4, "`x` must be 4D."
+     assert grid.ndim == 4, "`grid` must be 4D."
+
+     B, _, _, C = x.shape
+     _, gN, gM, D = grid.shape
+     out_shape = (B, gN, gM, C)
+
+     assert D == 2, "Last dim of `grid` must be size 2."
+
+     source = """
+         uint elem = thread_position_in_grid.x;
+         int H = x_shape[1];
+         int W = x_shape[2];
+         int C = x_shape[3];
+         int gH = grid_shape[1];
+         int gW = grid_shape[2];
+
+         int w_stride = C;
+         int h_stride = W * w_stride;
+         int b_stride = H * h_stride;
+
+         uint grid_idx = elem / C * 2;
+         float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
+         float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
+
+         int ix_nw = floor(ix);
+         int iy_nw = floor(iy);
+
+         int ix_ne = ix_nw + 1;
+         int iy_ne = iy_nw;
+
+         int ix_sw = ix_nw;
+         int iy_sw = iy_nw + 1;
+
+         int ix_se = ix_nw + 1;
+         int iy_se = iy_nw + 1;
+
+         T nw = (ix_se - ix) * (iy_se - iy);
+         T ne = (ix - ix_sw) * (iy_sw - iy);
+         T sw = (ix_ne - ix) * (iy - iy_ne);
+         T se = (ix - ix_nw) * (iy - iy_nw);
+
+         int batch_idx = elem / C / gH / gW * b_stride;
+         int channel_idx = elem % C;
+         int base_idx = batch_idx + channel_idx;
+
+         T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
+         T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
+         T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
+         T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
+
+         I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
+         I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
+         I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
+         I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
+
+         out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
+     """
+
+     kernel = mx.fast.metal_kernel(
+         name="grid_sample",
+         input_names=["x", "grid"],
+         output_names=["out"],
+         source=source,
+     )
+
+     outputs = kernel(
+         inputs=[x, grid],
+         template=[("T", x.dtype)],
+         output_shapes=[out_shape],
+         output_dtypes=[x.dtype],
+         grid=(mx.prod(mx.array(out_shape)), 1, 1),
+         threadgroup=(256, 1, 1),
+     )
+     return outputs[0]
+
+
+ def get_optimal_threadgroup(out_w, out_h):
+     # Calculate optimal threadgroup dimensions based on output dimensions
+
+     # Maximum threadgroup size for most Metal GPUs
+     # This could be made more dynamic with Metal API queries if needed
+     MAX_THREADS_PER_GROUP = 1024
+     MAX_THREADS_PER_DIM = 1024
+
+     # Start with a reasonable default size for 2D workloads
+     default_threadgroup = (32, 32, 1)
+
+     try:
+         # Don't create threadgroups larger than the work dimensions
+         max_width = min(MAX_THREADS_PER_DIM, out_w)
+         max_height = min(MAX_THREADS_PER_DIM, out_h)
+
+         # Find largest power of 2 that fits within our dimensions
+         width = 2 ** (max_width.bit_length() - 1)
+         if width > max_width:
+             width = width // 2
+
+         height = 2 ** (max_height.bit_length() - 1)
+         if height > max_height:
+             height = height // 2
+
+         # Ensure we don't exceed maximum threads per threadgroup
+         while width * height > MAX_THREADS_PER_GROUP:
+             # Reduce the larger dimension first
+             if width >= height:
+                 width = width // 2
+             else:
+                 height = height // 2
+
+         # Ensure minimum size for efficiency
+         width = max(8, width)
+         height = max(8, height)
+
+         return (width, height, 1)
+
+     except Exception:
+         # Return safe defaults if calculation fails
+         return default_threadgroup
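
Both interpolation helpers in this hunk take NCHW MLX arrays, track torch.nn.functional.interpolate's coordinate conventions, and build custom kernels via mx.fast.metal_kernel, so they need MLX's Metal backend (Apple silicon). A hedged usage sketch; the import path is inferred from the file list above (mlx_vlm/models/kernels.py) rather than shown in the hunk itself:

    import mlx.core as mx

    # Import path assumed from the file list; adjust if the module lives elsewhere.
    from mlx_vlm.models.kernels import bicubic_interpolate, nearest_interpolate

    x = mx.arange(2 * 3 * 4 * 4, dtype=mx.float32).reshape(2, 3, 4, 4)  # NCHW

    up = nearest_interpolate(x, scale_factor=2)                 # -> (2, 3, 8, 8)
    down = bicubic_interpolate(x, size=(2, 2), antialias=True)  # -> (2, 3, 2, 2)
    print(up.shape, down.shape)

grid_sample is the exception: per its kernel it expects channels-last input of shape [B, H, W, C] plus a [B, gN, gM, 2] grid of coordinates in [-1, 1], and it returns [B, gN, gM, C].
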
mlx_vlm/models/kimi_vl/__init__.py
@@ -0,0 +1,4 @@
+ from .config import ModelConfig, TextConfig, VisionConfig
+ from .kimi_vl import LanguageModel, Model, VisionModel
+ from .processing_kimi_vl import KimiVLImageProcessor as ImageProcessor
+ from .processing_kimi_vl import KimiVLProcessor as Processor
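
The final hunk shows how the kimi_vl package re-exports its classes under generic names (Model, Processor, ImageProcessor, and the config types), which appears to be the convention the other model packages in the file list follow as well. A purely illustrative consequence of those aliases (not an excerpt from the package):

    from mlx_vlm.models.kimi_vl import (
        ImageProcessor,  # alias of KimiVLImageProcessor
        LanguageModel,
        Model,
        ModelConfig,
        Processor,       # alias of KimiVLProcessor
        TextConfig,
        VisionConfig,
        VisionModel,
    )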