PyPI - fount-vlm-nell-02 - Versions diffs - 0.3.11__py3-none-any.whl - Mend

fount-vlm-nell-02 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (258)

fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
mlx_vlm/__init__.py +16 -0
mlx_vlm/__main__.py +24 -0
mlx_vlm/chat.py +234 -0
mlx_vlm/chat_ui.py +508 -0
mlx_vlm/convert.py +284 -0
mlx_vlm/deprecation.py +52 -0
mlx_vlm/evals/__init__.py +0 -0
mlx_vlm/evals/math_vista.py +565 -0
mlx_vlm/evals/mmmu.py +528 -0
mlx_vlm/evals/mmstar.py +343 -0
mlx_vlm/evals/ocrbench.py +453 -0
mlx_vlm/evals/utils.py +37 -0
mlx_vlm/generate.py +1457 -0
mlx_vlm/lora.py +207 -0
mlx_vlm/models/__init__.py +0 -0
mlx_vlm/models/aya_vision/__init__.py +2 -0
mlx_vlm/models/aya_vision/aya_vision.py +188 -0
mlx_vlm/models/aya_vision/config.py +52 -0
mlx_vlm/models/aya_vision/language.py +202 -0
mlx_vlm/models/aya_vision/vision.py +340 -0
mlx_vlm/models/base.py +356 -0
mlx_vlm/models/cache.py +238 -0
mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
mlx_vlm/models/deepseekocr/__init__.py +2 -0
mlx_vlm/models/deepseekocr/config.py +173 -0
mlx_vlm/models/deepseekocr/conversation.py +264 -0
mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
mlx_vlm/models/deepseekocr/language.py +547 -0
mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
mlx_vlm/models/deepseekocr/sam.py +489 -0
mlx_vlm/models/deepseekocr/vision.py +263 -0
mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
mlx_vlm/models/deepseekocr_2/config.py +216 -0
mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
mlx_vlm/models/deepseekocr_2/vision.py +439 -0
mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
mlx_vlm/models/fastvlm/__init__.py +2 -0
mlx_vlm/models/fastvlm/config.py +79 -0
mlx_vlm/models/fastvlm/fastvlm.py +198 -0
mlx_vlm/models/fastvlm/language.py +49 -0
mlx_vlm/models/fastvlm/vision.py +692 -0
mlx_vlm/models/florence2/__init__.py +2 -0
mlx_vlm/models/florence2/config.py +84 -0
mlx_vlm/models/florence2/florence2.py +383 -0
mlx_vlm/models/florence2/language.py +452 -0
mlx_vlm/models/florence2/processing_florence2.py +30 -0
mlx_vlm/models/florence2/vision.py +552 -0
mlx_vlm/models/gemma3/__init__.py +2 -0
mlx_vlm/models/gemma3/config.py +52 -0
mlx_vlm/models/gemma3/gemma3.py +194 -0
mlx_vlm/models/gemma3/language.py +293 -0
mlx_vlm/models/gemma3/vision.py +215 -0
mlx_vlm/models/gemma3n/__init__.py +2 -0
mlx_vlm/models/gemma3n/audio.py +1038 -0
mlx_vlm/models/gemma3n/config.py +130 -0
mlx_vlm/models/gemma3n/gemma3n.py +322 -0
mlx_vlm/models/gemma3n/language.py +631 -0
mlx_vlm/models/gemma3n/vision.py +994 -0
mlx_vlm/models/glm4v/__init__.py +3 -0
mlx_vlm/models/glm4v/config.py +79 -0
mlx_vlm/models/glm4v/glm4v.py +188 -0
mlx_vlm/models/glm4v/language.py +574 -0
mlx_vlm/models/glm4v/processing.py +220 -0
mlx_vlm/models/glm4v/vision.py +406 -0
mlx_vlm/models/glm4v_moe/__init__.py +3 -0
mlx_vlm/models/glm4v_moe/config.py +81 -0
mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
mlx_vlm/models/glm4v_moe/language.py +674 -0
mlx_vlm/models/glm4v_moe/processing.py +229 -0
mlx_vlm/models/glm4v_moe/vision.py +405 -0
mlx_vlm/models/glm_ocr/__init__.py +3 -0
mlx_vlm/models/glm_ocr/config.py +93 -0
mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
mlx_vlm/models/glm_ocr/language.py +585 -0
mlx_vlm/models/glm_ocr/processing.py +208 -0
mlx_vlm/models/glm_ocr/vision.py +342 -0
mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
mlx_vlm/models/hunyuan_vl/config.py +136 -0
mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
mlx_vlm/models/hunyuan_vl/language.py +509 -0
mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
mlx_vlm/models/hunyuan_vl/vision.py +322 -0
mlx_vlm/models/idefics2/__init__.py +2 -0
mlx_vlm/models/idefics2/config.py +65 -0
mlx_vlm/models/idefics2/idefics2.py +321 -0
mlx_vlm/models/idefics2/language.py +161 -0
mlx_vlm/models/idefics2/vision.py +244 -0
mlx_vlm/models/idefics3/__init__.py +4 -0
mlx_vlm/models/idefics3/config.py +54 -0
mlx_vlm/models/idefics3/idefics3.py +221 -0
mlx_vlm/models/idefics3/language.py +157 -0
mlx_vlm/models/idefics3/vision.py +265 -0
mlx_vlm/models/internvl_chat/__init__.py +3 -0
mlx_vlm/models/internvl_chat/config.py +89 -0
mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
mlx_vlm/models/internvl_chat/language.py +187 -0
mlx_vlm/models/internvl_chat/processor.py +395 -0
mlx_vlm/models/internvl_chat/vision.py +265 -0
mlx_vlm/models/interpolate.py +183 -0
mlx_vlm/models/jina_vlm/__init__.py +3 -0
mlx_vlm/models/jina_vlm/config.py +142 -0
mlx_vlm/models/jina_vlm/image_processor.py +430 -0
mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
mlx_vlm/models/jina_vlm/language.py +272 -0
mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
mlx_vlm/models/jina_vlm/vision.py +202 -0
mlx_vlm/models/kernels.py +447 -0
mlx_vlm/models/kimi_vl/__init__.py +4 -0
mlx_vlm/models/kimi_vl/config.py +84 -0
mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
mlx_vlm/models/kimi_vl/language.py +460 -0
mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
mlx_vlm/models/kimi_vl/vision.py +485 -0
mlx_vlm/models/lfm2_vl/__init__.py +2 -0
mlx_vlm/models/lfm2_vl/config.py +94 -0
mlx_vlm/models/lfm2_vl/language.py +49 -0
mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
mlx_vlm/models/lfm2_vl/vision.py +223 -0
mlx_vlm/models/llama4/__init__.py +2 -0
mlx_vlm/models/llama4/config.py +83 -0
mlx_vlm/models/llama4/language.py +334 -0
mlx_vlm/models/llama4/llama4.py +146 -0
mlx_vlm/models/llama4/vision.py +526 -0
mlx_vlm/models/llava/__init__.py +2 -0
mlx_vlm/models/llava/config.py +61 -0
mlx_vlm/models/llava/language.py +200 -0
mlx_vlm/models/llava/llava.py +132 -0
mlx_vlm/models/llava/vision.py +233 -0
mlx_vlm/models/llava_bunny/__init__.py +2 -0
mlx_vlm/models/llava_bunny/config.py +85 -0
mlx_vlm/models/llava_bunny/language.py +194 -0
mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
mlx_vlm/models/llava_bunny/vision.py +278 -0
mlx_vlm/models/llava_next/__init__.py +2 -0
mlx_vlm/models/llava_next/config.py +60 -0
mlx_vlm/models/llava_next/language.py +192 -0
mlx_vlm/models/llava_next/llava_next.py +138 -0
mlx_vlm/models/llava_next/vision.py +217 -0
mlx_vlm/models/mistral3/__init__.py +2 -0
mlx_vlm/models/mistral3/config.py +59 -0
mlx_vlm/models/mistral3/language.py +269 -0
mlx_vlm/models/mistral3/mistral3.py +383 -0
mlx_vlm/models/mllama/__init__.py +4 -0
mlx_vlm/models/mllama/config.py +74 -0
mlx_vlm/models/mllama/language.py +377 -0
mlx_vlm/models/mllama/mllama.py +210 -0
mlx_vlm/models/mllama/vision.py +458 -0
mlx_vlm/models/molmo/__init__.py +5 -0
mlx_vlm/models/molmo/config.py +93 -0
mlx_vlm/models/molmo/language.py +208 -0
mlx_vlm/models/molmo/molmo.py +108 -0
mlx_vlm/models/molmo/processing_molmo.py +763 -0
mlx_vlm/models/molmo/vision.py +408 -0
mlx_vlm/models/molmo2/__init__.py +6 -0
mlx_vlm/models/molmo2/config.py +137 -0
mlx_vlm/models/molmo2/language.py +206 -0
mlx_vlm/models/molmo2/molmo2.py +330 -0
mlx_vlm/models/molmo2/processing.py +773 -0
mlx_vlm/models/molmo2/vision.py +286 -0
mlx_vlm/models/moondream2/__init__.py +11 -0
mlx_vlm/models/moondream2/config.py +92 -0
mlx_vlm/models/moondream2/image_crops.py +269 -0
mlx_vlm/models/moondream2/language.py +267 -0
mlx_vlm/models/moondream2/moondream2.py +522 -0
mlx_vlm/models/moondream2/processing_moondream.py +144 -0
mlx_vlm/models/moondream2/vision.py +200 -0
mlx_vlm/models/multi_modality/__init__.py +4 -0
mlx_vlm/models/multi_modality/config.py +108 -0
mlx_vlm/models/multi_modality/language.py +191 -0
mlx_vlm/models/multi_modality/multi_modality.py +338 -0
mlx_vlm/models/multi_modality/sam.py +543 -0
mlx_vlm/models/multi_modality/vision.py +450 -0
mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
mlx_vlm/models/paddleocr_vl/config.py +93 -0
mlx_vlm/models/paddleocr_vl/language.py +522 -0
mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
mlx_vlm/models/paddleocr_vl/vision.py +358 -0
mlx_vlm/models/paligemma/__init__.py +4 -0
mlx_vlm/models/paligemma/config.py +50 -0
mlx_vlm/models/paligemma/language.py +253 -0
mlx_vlm/models/paligemma/paligemma.py +140 -0
mlx_vlm/models/paligemma/vision.py +218 -0
mlx_vlm/models/phi3_v/__init__.py +5 -0
mlx_vlm/models/phi3_v/config.py +55 -0
mlx_vlm/models/phi3_v/language.py +2 -0
mlx_vlm/models/phi3_v/phi3_v.py +239 -0
mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
mlx_vlm/models/phi3_v/vision.py +294 -0
mlx_vlm/models/pixtral/__init__.py +4 -0
mlx_vlm/models/pixtral/config.py +69 -0
mlx_vlm/models/pixtral/language.py +195 -0
mlx_vlm/models/pixtral/pixtral.py +208 -0
mlx_vlm/models/pixtral/vision.py +293 -0
mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
mlx_vlm/models/qwen2_5_vl/config.py +90 -0
mlx_vlm/models/qwen2_5_vl/language.py +541 -0
mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
mlx_vlm/models/qwen2_vl/__init__.py +2 -0
mlx_vlm/models/qwen2_vl/config.py +86 -0
mlx_vlm/models/qwen2_vl/language.py +539 -0
mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
mlx_vlm/models/qwen2_vl/vision.py +308 -0
mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
mlx_vlm/models/qwen3_vl/__init__.py +2 -0
mlx_vlm/models/qwen3_vl/config.py +103 -0
mlx_vlm/models/qwen3_vl/language.py +596 -0
mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
mlx_vlm/models/qwen3_vl/vision.py +441 -0
mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
mlx_vlm/models/smolvlm/__init__.py +4 -0
mlx_vlm/models/smolvlm/config.py +59 -0
mlx_vlm/models/smolvlm/smolvlm.py +60 -0
mlx_vlm/prompt_utils.py +565 -0
mlx_vlm/sample_utils.py +39 -0
mlx_vlm/server.py +1107 -0
mlx_vlm/smolvlm_video_generate.py +109 -0
mlx_vlm/tokenizer_utils.py +371 -0
mlx_vlm/trainer/__init__.py +9 -0
mlx_vlm/trainer/lora.py +70 -0
mlx_vlm/trainer/trainer.py +299 -0
mlx_vlm/trainer/utils.py +160 -0
mlx_vlm/utils.py +1339 -0
mlx_vlm/version.py +1 -0
mlx_vlm/video_generate.py +611 -0

mlx_vlm/models/florence2/language.py ADDED Viewed

@@ -0,0 +1,452 @@
+import math
+from typing import Optional, Tuple
+import mlx.core as mx
+import mlx.nn as nn
+from ..base import (
+    LanguageModelOutput,
+    create_attention_mask,
+    scaled_dot_product_attention,
+)
+from ..cache import SimpleKVCache
+from .config import TextConfig
+class Florence2Attention(nn.Module):
+    def __init__(
+        self, config: TextConfig, is_decoder: bool = False, is_causal: bool = False
+    ):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.num_heads = (
+            config.decoder_attention_heads
+            if is_decoder
+            else config.encoder_attention_heads
+        )
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+        self.head_dim = self.embed_dim // self.num_heads
+        self.scaling = self.head_dim**-0.5
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+    def __call__(
+        self,
+        hidden_states,
+        key_value_states=None,
+        cache: Optional[SimpleKVCache] = None,
+        attention_mask=None,
+        layer_head_mask=None,
+    ):
+        batch_size, tgt_len, _ = hidden_states.shape
+        q = (
+            self.q_proj(hidden_states)
+            .reshape(batch_size, tgt_len, self.num_heads, self.head_dim)
+            .transpose(0, 2, 1, 3)
+        )
+        is_cross_attention = key_value_states is not None
+        batch_size, tgt_len, _ = hidden_states.shape
+        src_len = (
+            key_value_states.shape[1]
+            if key_value_states is not None
+            else hidden_states.shape[1]
+        )
+        if (
+            is_cross_attention
+            and cache is not None
+            and cache.cache_length > 0
+            and cache.keys.shape[2] == key_value_states.shape[1]
+        ):
+            # Cross-attention with cached keys/values - reuse them
+            k = cache.keys
+            v = cache.values
+        elif is_cross_attention:
+            # Cross attention - compute and cache keys/values from encoder
+            k = (
+                self.k_proj(key_value_states)
+                .reshape(batch_size, src_len, self.num_heads, self.head_dim)
+                .transpose(0, 2, 1, 3)
+            )
+            v = (
+                self.v_proj(key_value_states)
+                .reshape(batch_size, src_len, self.num_heads, self.head_dim)
+                .transpose(0, 2, 1, 3)
+            )
+            # Cache the cross-attention keys/values
+            if cache is not None:
+                cache.update(k, v)
+        elif cache is not None:
+            # Self-attention with cache - compute new k,v and concatenate with cache
+            k = (
+                self.k_proj(hidden_states)
+                .reshape(batch_size, src_len, self.num_heads, -1)
+                .transpose(0, 2, 1, 3)
+            )
+            v = (
+                self.v_proj(hidden_states)
+                .reshape(batch_size, src_len, self.num_heads, -1)
+                .transpose(0, 2, 1, 3)
+            )
+            # update_and_fetch handles cache concatenation
+            k, v = cache.update_and_fetch(k, v)
+        else:
+            # Self attention without cache (encoder)
+            k = (
+                self.k_proj(hidden_states)
+                .reshape(batch_size, src_len, self.num_heads, self.head_dim)
+                .transpose(0, 2, 1, 3)
+            )
+            v = (
+                self.v_proj(hidden_states)
+                .reshape(batch_size, src_len, self.num_heads, self.head_dim)
+                .transpose(0, 2, 1, 3)
+            )
+        if self.is_causal and self.is_decoder:
+            causal_mask = create_attention_mask(hidden_states)
+            attention_mask = causal_mask
+        attn_output = (
+            scaled_dot_product_attention(
+                q, k, v, cache=cache, scale=self.scaling, mask=attention_mask
+            )
+            .transpose(0, 2, 1, 3)
+            .reshape(batch_size, tgt_len, -1)
+        )
+        attn_output = self.out_proj(attn_output)
+        return attn_output
+class Florence2EncoderLayer(nn.Module):
+    def __init__(self, config: TextConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = Florence2Attention(config, is_decoder=False, is_causal=False)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.activation_fn = nn.GELU()
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+    def __call__(self, hidden_states, attention_mask=None):
+        residual = hidden_states
+        hidden_states = self.self_attn(hidden_states, attention_mask=attention_mask)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        return hidden_states
+class Florence2DecoderLayer(nn.Module):
+    def __init__(self, config: TextConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = Florence2Attention(config, is_decoder=True, is_causal=True)
+        self.dropout = config.dropout
+        self.activation_fn = nn.GELU()
+        self.activation_dropout = config.activation_dropout
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = Florence2Attention(config, is_decoder=True)
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+    def __call__(
+        self,
+        hidden_states,
+        encoder_hidden_states,
+        attention_mask=None,
+        encoder_attention_mask=None,
+        cache: Optional[Tuple[SimpleKVCache, SimpleKVCache]] = None,
+    ):
+        residual = hidden_states
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_cache = cache[0] if cache[0] is not None else None
+        hidden_states = self.self_attn(
+            hidden_states, attention_mask=attention_mask, cache=self_attn_cache
+        )
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            # cross_attn cached key/values tuple is at positions 3,4 of cache tuple
+            cross_attn_cache = cache[-1] if cache[-1] is not None else None
+            hidden_states = self.encoder_attn(
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                cache=cross_attn_cache,
+            )
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        return hidden_states
+class Florence2Encoder(nn.Module):
+    def __init__(self, config: TextConfig):
+        super().__init__()
+        self.config = config
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        embed_dim = config.d_model
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+        self.offset = 2
+        self.embed_positions = nn.Embedding(
+            config.max_position_embeddings + self.offset, embed_dim
+        )
+        self.layers = [
+            Florence2EncoderLayer(config) for _ in range(config.encoder_layers)
+        ]
+        self.layernorm_embedding = nn.LayerNorm(embed_dim)
+    def __call__(self, input_ids=None, inputs_embeds=None, attention_mask=None):
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            input_shape = inputs_embeds.shape
+        else:
+            input_shape = inputs_embeds.shape
+        positions = mx.arange(input_shape[1])
+        if positions.ndim == 1:
+            positions = mx.expand_dims(positions, axis=0)
+        embed_pos = self.embed_positions(positions + self.offset)
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = self.layernorm_embedding(hidden_states)
+        for encoder_layer in self.layers:
+            # Add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = mx.random.uniform()
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+            hidden_states = encoder_layer(hidden_states, attention_mask)
+        return hidden_states
+class Florence2Decoder(nn.Module):
+    def __init__(self, config: TextConfig):
+        super().__init__()
+        self.config = config
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+        self.offset = 2
+        self.embed_positions = nn.Embedding(
+            config.max_position_embeddings + self.offset, config.d_model
+        )
+        self.layers = [
+            Florence2DecoderLayer(config) for _ in range(config.decoder_layers)
+        ]
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+    def __call__(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        inputs_embeds=None,
+        cache=None,
+    ):
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            input_shape = inputs_embeds.shape  # for 2d masks
+            positions = input_ids
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.shape[:-1]  # for 4d masks
+            positions = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError(
+                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+            )
+        if positions.ndim == 1:
+            positions = mx.expand_dims(positions, axis=0)
+        cache_length = cache[0][0].keys.shape[2] if cache[0][0].cache_length > 0 else 0
+        bsz, seq_len = inputs_embeds.shape[:2]
+        positions = mx.arange(
+            cache_length,
+            cache_length + seq_len,
+            dtype=mx.int64,
+        )
+        positions = mx.expand_dims(positions, axis=0)
+        embed_pos = self.embed_positions(positions + self.offset)
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = self.layernorm_embedding(hidden_states)
+        for decoder_layer, c in zip(self.layers, cache):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = mx.random.uniform()
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+            hidden_states = decoder_layer(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                cache=c,
+            )
+        return hidden_states
+class Florence2LanguageModel(nn.Module):
+    def __init__(self, config: TextConfig):
+        super().__init__()
+        self.config = config
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+        self.encoder = Florence2Encoder(config)
+        self.decoder = Florence2Decoder(config)
+        if config.scale_embedding:
+            self.embed_scale = math.sqrt(config.d_model)
+        else:
+            self.embed_scale = 1.0
+    def __call__(
+        self,
+        input_ids=None,
+        inputs_embeds=None,
+        decoder_input_ids=None,
+        decoder_inputs_embeds=None,
+        attention_mask=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        cache=None,
+    ):
+        self.encoder.embed_tokens = self.shared
+        self.decoder.embed_tokens = self.shared
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            if input_ids is None:
+                raise ValueError(
+                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
+                    "passed, `input_ids` cannot be `None`. Please pass either "
+                    "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
+                )
+            decoder_input_ids = mx.zeros_like(input_ids)
+            decoder_input_ids[:, 1:] = input_ids[:, :-1]
+            decoder_input_ids[:, 0] = self.config.bos_token_id
+        if inputs_embeds is not None:
+            inputs_embeds = inputs_embeds * self.embed_scale
+        if cache is None:
+            cache = [(SimpleKVCache(), SimpleKVCache())] * len(self.decoder.layers)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+            )
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs,
+            encoder_attention_mask=attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            cache=cache,
+        )
+        return decoder_outputs, encoder_outputs
+class LanguageModel(nn.Module):
+    def __init__(self, config: TextConfig):
+        super().__init__()
+        self.config = config
+        self.model = Florence2LanguageModel(config)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+    def __call__(
+        self,
+        inputs=None,
+        inputs_embeds=None,
+        decoder_input_ids=None,
+        decoder_inputs_embeds=None,
+        attention_mask=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        cache=None,
+        **kwargs,
+    ):
+        decoder_outputs, encoder_outputs = self.model(
+            inputs,
+            inputs_embeds,
+            decoder_input_ids,
+            decoder_inputs_embeds,
+            attention_mask,
+            decoder_attention_mask,
+            encoder_outputs,
+            cache,
+        )
+        out = self.lm_head(decoder_outputs)
+        return LanguageModelOutput(logits=out, encoder_outputs=encoder_outputs)
+    @property
+    def layers(self):
+        return range(self.model.config.decoder_layers)
+    @property
+    def head_dim(self):
+        return self.config.d_model // self.config.decoder_attention_heads
+    @property
+    def n_kv_heads(self):
+        return self.config.decoder_attention_heads
+    def make_cache(self):
+        return [(SimpleKVCache(), SimpleKVCache()) for n in self.layers]

mlx_vlm/models/florence2/processing_florence2.py ADDED Viewed

@@ -0,0 +1,30 @@
+from transformers.models.florence2.processing_florence2 import Florence2Processor
+# Store the original __init__
+_original_init = Florence2Processor.__init__
+def _patched_init(self, image_processor=None, tokenizer=None, **kwargs):
+    """Patched __init__ that adds image_token attributes to tokenizer if missing."""
+    if tokenizer is not None:
+        # Ensure tokenizer has image_token attribute
+        if not hasattr(tokenizer, "image_token"):
+            tokenizer.image_token = "<image>"
+        # Ensure tokenizer has image_token_id attribute
+        if not hasattr(tokenizer, "image_token_id"):
+            vocab = tokenizer.get_vocab()
+            if tokenizer.image_token in vocab:
+                tokenizer.image_token_id = vocab[tokenizer.image_token]
+            else:
+                tokenizer.add_tokens([tokenizer.image_token], special_tokens=True)
+                tokenizer.image_token_id = tokenizer.convert_tokens_to_ids(
+                    tokenizer.image_token
+                )
+    # Call original __init__
+    _original_init(self, image_processor=image_processor, tokenizer=tokenizer, **kwargs)
+# Apply the patch
+Florence2Processor.__init__ = _patched_init