fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
@@ -0,0 +1,187 @@
+from typing import Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from ..base import (
+    LanguageModelOutput,
+    create_attention_mask,
+    scaled_dot_product_attention,
+)
+from ..cache import KVCache
+from .config import TextConfig
+
+
+class Attention(nn.Module):
+    def __init__(self, args: TextConfig):
+        super().__init__()
+
+        dim = args.hidden_size
+        self.n_heads = n_heads = args.num_attention_heads
+        assert args.num_key_value_heads is not None
+        self.n_kv_heads = n_kv_heads = args.num_key_value_heads
+
+        # Allow overriding head_dim to support architectures where
+        # n_heads * head_dim != hidden_size.
+        self.head_dim = head_dim = getattr(args, "head_dim", None) or (
+            args.hidden_size // n_heads
+        )
+        self.scale = head_dim**-0.5
+
+        self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=True)
+        self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=True)
+        self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=True)
+        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
+
+        self.rotary_emb = nn.RoPE(
+            head_dim,
+            base=args.rope_theta,
+            traditional=args.rope_traditional,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[KVCache] = None,
+    ) -> mx.array:
+        B, L, D = x.shape
+
+        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+
+        # Prepare the queries, keys and values for the attention computation
+        queries = queries.reshape(B, L, self.n_heads, self.head_dim).transpose(
+            0, 2, 1, 3
+        )
+        keys = keys.reshape(B, L, self.n_kv_heads, self.head_dim).transpose(0, 2, 1, 3)
+        values = values.reshape(B, L, self.n_kv_heads, self.head_dim).transpose(
+            0, 2, 1, 3
+        )
+
+        offset = cache.offset if cache else 0
+
+        if mask is not None and isinstance(mask, mx.array):
+            mask = mask[..., : keys.shape[-2]]
+
+        queries = self.rotary_emb(queries, offset=offset)
+        keys = self.rotary_emb(keys, offset=offset)
+
+        if cache is not None:
+            keys, values = cache.update_and_fetch(keys, values)
+
+        output = scaled_dot_product_attention(
+            queries, keys, values, cache, scale=self.scale, mask=mask
+        )
+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+        return self.o_proj(output)
+
+
+class MLP(nn.Module):
+    def __init__(self, dim, hidden_dim):
+        super().__init__()
+        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
+        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
+
+    def __call__(self, x) -> mx.array:
+        return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
+
+
+class Qwen2VLDecoderLayer(nn.Module):
+    def __init__(self, args: TextConfig):
+        super().__init__()
+        self.num_attention_heads = args.num_attention_heads
+        self.hidden_size = args.hidden_size
+        self.self_attn = Attention(args)
+        self.mlp = MLP(args.hidden_size, args.intermediate_size)
+        self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+        self.post_attention_layernorm = nn.RMSNorm(
+            args.hidden_size, eps=args.rms_norm_eps
+        )
+        self.args = args
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[KVCache] = None,
+    ) -> mx.array:
+        r = self.self_attn(self.input_layernorm(x), mask, cache)
+        h = x + r
+        r = self.mlp(self.post_attention_layernorm(h))
+        out = h + r
+        return out
+
+
+class Qwen2Model(nn.Module):
+    def __init__(self, args: TextConfig):
+        super().__init__()
+        self.args = args
+        self.vocab_size = args.vocab_size
+        self.num_hidden_layers = args.num_hidden_layers
+        assert self.vocab_size > 0
+        self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
+        self.layers = [
+            Qwen2VLDecoderLayer(args=args) for _ in range(args.num_hidden_layers)
+        ]
+        self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        inputs_embeds: Optional[mx.array] = None,
+        mask: Optional[mx.array] = None,
+        cache=None,
+    ):
+        if inputs_embeds is None:
+            h = self.embed_tokens(inputs)
+        else:
+            h = inputs_embeds
+
+        if cache is None:
+            cache = [None] * len(self.layers)
+
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        for layer, c in zip(self.layers, cache):
+            h = layer(h, mask, c)
+
+        return self.norm(h)
+
+
+class LanguageModel(nn.Module):
+    def __init__(self, args: TextConfig):
+        super().__init__()
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen2Model(args)
+
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        inputs_embeds: Optional[mx.array] = None,
+        mask: Optional[mx.array] = None,
+        cache=None,
+    ):
+        out = self.model(inputs, cache=cache, inputs_embeds=inputs_embeds)
+        if self.args.tie_word_embeddings:
+            out = self.model.embed_tokens.as_linear(out)
+        else:
+            out = self.lm_head(out)
+        return LanguageModelOutput(logits=out)
+
+    @property
+    def layers(self):
+        return self.model.layers
+
+    @property
+    def head_dim(self):
+        return self.args.hidden_size // self.args.num_attention_heads
+
+    @property
+    def n_kv_heads(self):
+        return self.args.num_key_value_heads
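The 187-line diff above, which appears to correspond to mlx_vlm/models/internvl_chat/language.py (+187) in the file list, defines a Qwen2-style text decoder: pre-norm residual blocks around grouped-query attention and a gated MLP. The sketch below is not part of the package; it only illustrates the head-dimension bookkeeping that Attention.__call__ performs, with hypothetical batch, sequence, and head counts.

# Standalone shape sketch (not from the package) of the grouped-query attention
# reshape/transpose used in Attention.__call__. All sizes here are hypothetical.
import mlx.core as mx

B, L = 1, 8                  # batch size, sequence length
hidden_size = 64
n_heads, n_kv_heads = 8, 2   # grouped-query attention: fewer key/value heads than query heads
head_dim = hidden_size // n_heads

q = mx.random.normal((B, L, n_heads * head_dim))
k = mx.random.normal((B, L, n_kv_heads * head_dim))

# Same layout change as the module above: (B, L, H * D) -> (B, H, L, D)
q = q.reshape(B, L, n_heads, head_dim).transpose(0, 2, 1, 3)
k = k.reshape(B, L, n_kv_heads, head_dim).transpose(0, 2, 1, 3)
print(q.shape, k.shape)  # (1, 8, 8, 8) (1, 2, 8, 8)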
@@ -0,0 +1,395 @@
+from typing import List, Optional, Union
+
+import mlx.core as mx
+import numpy as np
+from PIL import Image
+from transformers import (
+    AutoImageProcessor,
+    AutoProcessor,
+    AutoTokenizer,
+    BatchFeature,
+    ProcessorMixin,
+)
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+# Constants for image processing (from internvl_chat.py)
+
+IMAGENET_MEAN = np.array([0.485, 0.456, 0.406])
+IMAGENET_STD = np.array([0.229, 0.224, 0.225])
+# chat_template = get_conv_template("internvl2_5")
+chat_template = "{% for message in messages %}{{message['role'].capitalize() + ': '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['content'] }}{% endfor %}{{'\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:\n' }}{% endif %}"
+
+IMG_START_TOKEN = "<img>"
+IMG_END_TOKEN = "</img>"
+IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+
+
+def build_transform(input_size):
+    """
+    Builds a transformation pipeline for images.
+
+    Args:
+        input_size (int): The target size for the image (height and width).
+
+    Returns:
+        function: A function that takes a PIL image and returns a normalized mx.array.
+    """
+    mean = mx.array(IMAGENET_MEAN)
+    std = mx.array(IMAGENET_STD)
+
+    def transform(img: Image.Image) -> mx.array:
+        # Ensure image is RGB
+        if img.mode != "RGB":
+            img = img.convert("RGB")
+
+        # Resize using PIL - BICUBIC interpolation is default in Pillow >= 9.1.0 for resize
+        # For older versions, you might need Pillow-SIMD or explicitly set
+        # resampling=Image.BICUBIC if available.
+        img = img.resize((input_size, input_size), resample=Image.Resampling.BICUBIC)
+
+        # Convert PIL image to NumPy array (H, W, C) and scale to [0, 1]
+        img_np = np.array(img).astype(np.float32) / 255.0
+
+        # Convert to MLX array and transpose to (C, H, W)
+        img_mx = mx.array(img_np).transpose(2, 0, 1)
+
+        # Normalize
+        img_mx = (img_mx - mean[:, None, None]) / std[:, None, None]
+
+        return img_mx
+
+    return transform
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """Finds the closest aspect ratio from a list of targets."""
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            # Prioritize ratios closer to the original image area if diffs are equal
+            target_area = image_size * image_size * ratio[0] * ratio[1]
+            if abs(area - target_area) < abs(
+                area - image_size * image_size * best_ratio[0] * best_ratio[1]
+            ):
+                best_ratio = ratio
+    return best_ratio
+
+
+def dynamic_preprocess(
+    image: Image.Image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+):
+    """
+    Preprocesses the image by splitting it into blocks based on the closest aspect ratio.
+
+    Args:
+        image (PIL.Image.Image): Input image.
+        min_num (int): Minimum number of blocks.
+        max_num (int): Maximum number of blocks.
+        image_size (int): Target size for each block.
+        use_thumbnail (bool): Whether to include a thumbnail of the original image.
+
+    Returns:
+        list[PIL.Image.Image]: A list of processed image blocks (as PIL images).
+    """
+    orig_width, orig_height = image.size
+    if orig_width == 0 or orig_height == 0:
+        # Handle potential zero dimensions
+        return []
+
+    aspect_ratio = orig_width / orig_height
+
+    # Calculate the possible target aspect ratios
+    target_ratios = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if min_num <= i * j <= max_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # Find the closest target aspect ratio
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+
+    # Calculate the target dimensions for resizing
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # Resize the image to fit the target block structure
+    # Using BICUBIC resampling
+    resized_img = image.resize(
+        (target_width, target_height), resample=Image.Resampling.BICUBIC
+    )
+
+    processed_images = []
+    # Crop the resized image into blocks
+    for i in range(blocks):
+        # Calculate crop box for the i-th block
+        row_idx = i // target_aspect_ratio[0]
+        col_idx = i % target_aspect_ratio[0]
+        left = col_idx * image_size
+        top = row_idx * image_size
+        right = (col_idx + 1) * image_size
+        bottom = (row_idx + 1) * image_size
+        box = (left, top, right, bottom)
+
+        # Crop and add the block
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+
+    assert (
+        len(processed_images) == blocks
+    ), f"Expected {blocks} blocks, but got {len(processed_images)}"
+
+    # Add a thumbnail if requested and if the image was split
+    if use_thumbnail and blocks > 1:
+        thumbnail_img = image.resize(
+            (image_size, image_size), resample=Image.Resampling.BICUBIC
+        )
+        processed_images.append(thumbnail_img)
+
+    return processed_images
+
+
+class InternVLImageProcessor(BaseImageProcessor):
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: int = 448,  # Default image size from dynamic_preprocess
+        resample=Image.Resampling.BICUBIC,
+        do_center_crop: bool = False,  # Not used in original, but standard HF param
+        crop_size=None,
+        do_rescale: bool = True,  # Original code scales by 1/255.0
+        rescale_factor: float = 1 / 255.0,
+        do_normalize: bool = True,
+        image_mean=IMAGENET_MEAN.tolist(),
+        image_std=IMAGENET_STD.tolist(),
+        do_dynamic_preprocess: bool = True,
+        dynamic_min_num: int = 1,
+        dynamic_max_num: int = 12,
+        dynamic_use_thumbnail: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.do_resize = (
+            do_resize  # Although dynamic_preprocess handles resizing internally
+        )
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        # Custom dynamic processing params
+        self.do_dynamic_preprocess = do_dynamic_preprocess
+        self.dynamic_min_num = dynamic_min_num
+        self.dynamic_max_num = dynamic_max_num
+        self.dynamic_use_thumbnail = dynamic_use_thumbnail
+
+    def preprocess(
+        self,
+        images: List[Image.Image],
+        do_dynamic_preprocess: Optional[bool] = None,
+        size: Optional[int] = None,
+        # ... other params matching __init__ ...
+        return_tensors: Optional[str] = None,
+        **kwargs,
+    ) -> List[mx.array]:
+
+        do_dynamic_preprocess = (
+            do_dynamic_preprocess
+            if do_dynamic_preprocess is not None
+            else self.do_dynamic_preprocess
+        )
+        size = size if size is not None else self.size
+        # ... handle other overrides ...
+
+        if not isinstance(images, list):
+            images = [images]
+
+        if not all(isinstance(image, Image.Image) for image in images):
+            raise ValueError("Input must be a list of PIL Images.")
+
+        processed_images_batch = []
+        for image in images:
+            # Apply dynamic preprocessing
+            if do_dynamic_preprocess:
+                processed_images = dynamic_preprocess(
+                    image,
+                    min_num=self.dynamic_min_num,
+                    max_num=self.dynamic_max_num,
+                    image_size=size,
+                    use_thumbnail=self.dynamic_use_thumbnail,
+                )
+            else:
+                # Fallback or alternative simpler preprocessing if needed
+                # e.g., simple resize + normalize
+                processed_images = [image.resize((size, size), resample=self.resample)]
+
+            # Create transform function
+            transform = build_transform(input_size=size)
+
+            # Apply transform to each image block and collect arrays
+            pixel_values_list = [transform(img) for img in processed_images]
+
+            # Stack the arrays along a new dimension (batch dimension)
+            pixel_values = mx.stack(pixel_values_list, axis=0)
+
+            processed_images_batch.append(pixel_values)
+
+        # At this point, processed_images_batch contains a list of mx arrays,
+        # each array corresponding to an input image with stacked blocks.
+
+        data = {"pixel_values": mx.array(processed_images_batch)}
+        return BatchFeature(data=data, tensor_type=None)
+
+
+class InternVLChatProcessor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "InternVLImageProcessor"
+    tokenizer_class = (
+        "AutoTokenizer",
+        "Qwen2TokenizerFast",
+    )  # Specify possible classes
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=chat_template,
+        **kwargs,
+    ):
+        if image_processor is None:
+            image_processor = InternVLImageProcessor(**kwargs)
+        if isinstance(tokenizer, str):
+            # Defaulting to the likely repo ID found earlier
+            tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer, trust_remote_code=True, **kwargs
+            )
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+        self.num_image_token = int((448 // 14) ** 2 * (0.5**2))
+
+    def __call__(
+        self,
+        text: Union[str, List[str]] = None,
+        images: List[Image.Image] = None,
+        padding: Union[bool, str] = True,
+        truncation: bool = True,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[str] = "pt",  # Default to PyTorch tensors
+        **kwargs,
+    ):
+        processed_inputs = {}
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+
+            if len(text) == 1 and images is not None and len(images) > 1:
+                raise ValueError("Multi-image inference is not supported.")
+
+        if images is not None:
+            image_features = self.image_processor.preprocess(
+                images, return_tensors=return_tensors, **kwargs
+            )
+            processed_inputs.update(image_features)  # Should contain 'pixel_values'
+
+        if text is not None:
+            queries = []
+
+            for idx in range(len(images)):
+                question = text[idx]
+
+                if images is not None and "<image>" not in question:
+                    question = "<image>\n" + question
+
+                num_patches = image_features["pixel_values"][idx].shape[0]
+                image_tokens = (
+                    IMG_START_TOKEN
+                    + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
+                    + IMG_END_TOKEN
+                )
+                question = question.replace("<image>", image_tokens, 1)
+                queries.append(question)
+
+            self.tokenizer.padding_side = "left"
+            text_inputs = self.tokenizer(
+                queries,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
+            processed_inputs.update(text_inputs)  # 'input_ids', 'attention_mask'
+
+        return processed_inputs
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the tokenizer's batch_decode method.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the tokenizer's decode method.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def save_pretrained(self, save_directory, **kwargs):
+        pass
+
+    @staticmethod
+    def from_pretrained(pretrained_model_name_or_path, **kwargs):
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, **kwargs
+        )
+        image_processor = InternVLImageProcessor(**kwargs)
+        return InternVLChatProcessor(
+            image_processor=image_processor, tokenizer=tokenizer
+        )
+
+    # Need save_pretrained and from_pretrained
+    # save_pretrained should save both tokenizer and image_processor configs/files
+    # from_pretrained should load both
+
+    # Example:
+    # def save_pretrained(self, save_directory, **kwargs):
+    #     self.tokenizer.save_pretrained(save_directory, **kwargs)
+    #     self.image_processor.save_pretrained(save_directory, **kwargs)
+
+    # def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    #     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+    #     image_processor = InternVLImageProcessor.from_pretrained(pretrained_model_name_or_path, **kwargs)
+    #     return cls(image_processor=image_processor, tokenizer=tokenizer)
+
+
+# Registration
+MODEL_TYPE = "internvl_chat"  # Verify this from the model's config.json
+
+AutoImageProcessor.register(
+    MODEL_TYPE, slow_image_processor_class=InternVLImageProcessor
+)
+AutoProcessor.register(MODEL_TYPE, InternVLChatProcessor)
+
+logger.info(f"Registered custom processor classes for model type '{MODEL_TYPE}'.")