fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
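
The package layout above mirrors upstream mlx-vlm (generate.py, prompt_utils.py, utils.py, and a models/ tree). Assuming this wheel keeps that project's public entry points — load, generate, and apply_chat_template are assumptions based on the upstream layout, not something this diff confirms — a typical invocation of the newly added HunYuan-VL model might look like the sketch below.

# Hypothetical usage sketch. Assumes this wheel re-exports the upstream
# mlx-vlm API (load / generate / apply_chat_template); exact signatures vary
# between releases, so treat this as illustrative rather than authoritative.
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

model_path = "org/hunyuan-vl-mlx"  # placeholder model id, not a real checkpoint
model, processor = load(model_path)
config = load_config(model_path)

prompt = apply_chat_template(processor, config, "Describe this image.", num_images=1)
output = generate(model, processor, prompt, ["example.jpg"], max_tokens=128, verbose=False)
print(output)
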
mlx_vlm/models/hunyuan_vl/hunyuan_vl.py
@@ -0,0 +1,181 @@
+ from typing import Dict, Optional
+
+ import mlx.core as mx
+ import mlx.nn as nn
+
+ from ..base import InputEmbeddingsFeatures, check_array_shape
+ from .config import ModelConfig
+ from .language import LanguageModel
+ from .vision import VisionModel
+
+ try:
+     from transformers import AutoImageProcessor, AutoProcessor
+
+     from .processing_hunyuan_vl import HunYuanVLImageProcessor, HunYuanVLProcessor
+
+     MODEL_TYPE = "hunyuan_vl"
+
+     AutoImageProcessor.register(
+         MODEL_TYPE, slow_image_processor_class=HunYuanVLImageProcessor
+     )
+     AutoProcessor.register(MODEL_TYPE, HunYuanVLProcessor)
+
+ except Exception as e:
+     raise e
+
+
+ class Model(nn.Module):
+
+     def __init__(self, config: ModelConfig):
+         super().__init__()
+         self.config = config
+         self.model_type = config.model_type
+         self.vision_tower = VisionModel(config.vision_config)
+         self.language_model = LanguageModel(config)
+
+     def get_input_embeddings(
+         self,
+         input_ids: Optional[mx.array] = None,
+         pixel_values: Optional[mx.array] = None,
+         **kwargs,
+     ) -> mx.array:
+
+         image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+         position_ids_from_processor = kwargs.pop("position_ids", None)
+
+         # Get text embeddings
+         inputs_embeds = self.language_model.model.embed_tokens(input_ids)
+
+         # If no image, return text embeddings
+         if pixel_values is None:
+             # Reset stored position_ids when no image
+             self.language_model._position_ids = None
+             return InputEmbeddingsFeatures(inputs_embeds=inputs_embeds)
+
+         # Get vision features
+         vision_features = self.vision_tower(pixel_values, image_grid_thw)
+
+         # Find image token positions and replace with vision features
+         image_token_id = self.config.image_token_id
+         image_mask = input_ids == image_token_id
+
+         # Get number of image tokens expected
+         num_image_tokens = image_mask.sum().item()
+         num_vision_tokens = vision_features.shape[1]
+
+         if num_image_tokens != num_vision_tokens:
+             raise ValueError(
+                 f"Number of image placeholders ({num_image_tokens}) does not match "
+                 f"number of vision tokens ({num_vision_tokens}). "
+                 f"Expected token count based on grid: {num_vision_tokens}"
+             )
+
+         B, L, _ = inputs_embeds.shape
+
+         output_parts = []
+
+         for b in range(B):
+             mask_b = image_mask[b]  # (L,) boolean mask
+             text_embeds_b = inputs_embeds[b]  # (L, D)
+             vis_feats_b = vision_features[b]  # (num_vis_tokens, D)
+
+             # Build sequence for this batch
+             vis_idx = 0
+             seq_parts = []
+             for pos in range(L):
+                 if mask_b[pos].item():
+                     # Use vision feature
+                     seq_parts.append(vis_feats_b[vis_idx : vis_idx + 1])
+                     vis_idx += 1
+                 else:
+                     # Use text embedding
+                     seq_parts.append(text_embeds_b[pos : pos + 1])
+
+             # Concatenate all parts for this batch
+             batch_embeds = mx.concatenate(seq_parts, axis=0)  # (L, D)
+             output_parts.append(batch_embeds[None, :, :])  # (1, L, D)
+
+         # Stack batches
+         inputs_embeds = mx.concatenate(output_parts, axis=0)  # (B, L, D)
+
+         # Pre-calculate position_ids for chunked prefill
+         if position_ids_from_processor is not None:
+             self.language_model._position_ids = position_ids_from_processor
+         elif image_grid_thw is not None:
+             position_ids = self.language_model.get_xdrope_input_positions(
+                 input_tokens=input_ids[0].tolist(),
+                 image_grid_thw=image_grid_thw,
+                 image_token_id=self.config.image_token_id,
+                 spatial_merge_size=self.config.vision_config.spatial_merge_size,
+             )[None, ...]
+             self.language_model._position_ids = position_ids
+
+         return InputEmbeddingsFeatures(inputs_embeds=inputs_embeds)
+
+     @property
+     def layers(self):
+         return self.language_model.model.layers
+
+     @property
+     def head_dim(self):
+         return self.config.text_config.head_dim
+
+     @property
+     def n_kv_heads(self):
+         return self.config.text_config.num_key_value_heads
+
+     def __call__(
+         self,
+         input_ids: mx.array,
+         pixel_values: Optional[mx.array] = None,
+         mask: Optional[mx.array] = None,
+         cache=None,
+         **kwargs,
+     ):
+
+         # Get embeddings (with vision features merged if image provided)
+         input_embeddings_features = self.get_input_embeddings(
+             input_ids=input_ids,
+             pixel_values=pixel_values,
+             **kwargs,
+         )
+
+         # Forward through language model
+         return self.language_model(
+             inputs=input_ids,
+             inputs_embeds=input_embeddings_features.inputs_embeds,
+             mask=mask,
+             cache=cache,
+             image_grid_thw=kwargs.get("image_grid_thw", None),
+         )
+
+     def sanitize(self, weights: Dict[str, mx.array]) -> Dict[str, mx.array]:
+
+         sanitized = {}
+
+         for key, value in weights.items():
+             new_key = key
+
+             # Language model mappings
+             if key.startswith("model."):
+                 new_key = "language_model." + key
+
+             # Vision tower mappings
+             elif key.startswith("vit."):
+                 new_key = key.replace("vit.", "vision_tower.", 1)
+
+             # Handle Conv2d weight transposition for MLX
+             # PyTorch Conv2d: [out_channels, in_channels, kH, kW]
+             # MLX Conv2d: [out_channels, kH, kW, in_channels]
+             if (
+                 "patch_embedding.weight" in new_key
+                 or "proj.0.weight" in new_key
+                 or "proj.2.weight" in new_key
+             ):
+                 if not check_array_shape(value):
+                     value = value.transpose(0, 2, 3, 1)
+
+             sanitized[new_key] = value
+
+         return sanitized
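
Model.sanitize() above only renames checkpoint keys and moves PyTorch's channels-second Conv2d kernels into MLX's channels-last layout. A minimal standalone sketch of the same remapping follows; check_array_shape (imported from ..base in the real code) is omitted here, and the tensor shape is made up.

import mlx.core as mx

# Standalone sketch of the remapping done by Model.sanitize(); the real code
# additionally guards the transpose with check_array_shape().
def remap_weights(weights: dict) -> dict:
    out = {}
    for key, value in weights.items():
        if key.startswith("model."):
            key = "language_model." + key  # text backbone lives under language_model.model.*
        elif key.startswith("vit."):
            key = key.replace("vit.", "vision_tower.", 1)
        # PyTorch Conv2d stores [out, in, kH, kW]; MLX expects [out, kH, kW, in].
        if "patch_embedding.weight" in key and value.ndim == 4:
            value = value.transpose(0, 2, 3, 1)
        out[key] = value
    return out

# Fake patch-embedding kernel: 3 input channels, 14x14 patches (shapes are illustrative).
w = {"vit.patch_embedding.weight": mx.zeros((1152, 3, 14, 14))}
print(remap_weights(w)["vision_tower.patch_embedding.weight"].shape)  # (1152, 14, 14, 3)
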
mlx_vlm/models/hunyuan_vl/language.py
@@ -0,0 +1,509 @@
+ from typing import List, Optional, Tuple
+
+ import mlx.core as mx
+ import mlx.nn as nn
+ import numpy as np
+
+ from ..base import (
+     LanguageModelOutput,
+     create_attention_mask,
+     scaled_dot_product_attention,
+ )
+ from ..cache import KVCache
+ from .config import ModelConfig, TextConfig
+
+
+ class HunyuanRotaryEmbedding:
+     def __init__(self, config: TextConfig):
+         super().__init__()
+         self.config = config
+         self.dim = config.head_dim
+         self.max_position_embeddings = config.max_position_embeddings
+         self.base = config.rope_theta
+
+         # Handle xdrope/dynamic scaling (guard against rope_scaling being absent)
+         rope_scaling = config.rope_scaling or {}
+         self.xdrope_section = rope_scaling.get("xdrope_section")
+         self.rope_type = rope_scaling.get("type")
+         alpha = rope_scaling.get("alpha")
+
+         if self.rope_type in ["xdrope", "dynamic"]:
+             if alpha:
+                 self.base = self.base * (alpha ** (self.dim / (self.dim - 2)))
+
+         inv_freq = 1.0 / (
+             self.base ** (mx.arange(0, self.dim, 2).astype(mx.float32) / self.dim)
+         )
+         self._inv_freq = inv_freq
+         self._cos_cached = None
+         self._sin_cached = None
+         self._cached_seq_len = 0
+
+     def _update_cache(self, seq_len: int, dtype: mx.Dtype):
+         if seq_len > self._cached_seq_len:
+             self._cached_seq_len = seq_len
+             t = mx.arange(seq_len, dtype=mx.float32)
+             freqs = mx.outer(t, self._inv_freq)
+             emb = mx.concatenate([freqs, freqs], axis=-1)
+             self._cos_cached = mx.cos(emb).astype(dtype)
+             self._sin_cached = mx.sin(emb).astype(dtype)
+
+     def __call__(self, x: mx.array, seq_len: int) -> Tuple[mx.array, mx.array]:
+         self._update_cache(seq_len, x.dtype)
+         return self._cos_cached[:seq_len], self._sin_cached[:seq_len]
+
+
+ def rotate_half(x: mx.array) -> mx.array:
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return mx.concatenate([-x2, x1], axis=-1)
+
+
+ def apply_rotary_pos_emb_xdrope(
+     q: mx.array,
+     k: mx.array,
+     cos: mx.array,
+     sin: mx.array,
+     position_ids: mx.array,
+     xdrope_section: list,
+     output_size: tuple,
+ ) -> Tuple[mx.array, mx.array]:
+     """Applies XD Rotary Position Embedding."""
+
+     x_dim = len(xdrope_section)
+     cos = (
+         cos[position_ids, ...]
+         .transpose(0, 2, 1, 3)
+         .reshape(output_size[0], output_size[2], x_dim, -1)
+     )
+     sin = (
+         sin[position_ids, ...]
+         .transpose(0, 2, 1, 3)
+         .reshape(output_size[0], output_size[2], x_dim, -1)
+     )
+
+     xdrope_section = xdrope_section * 2
+
+     # for xd concat
+     assert sum(xdrope_section) == cos.shape[-1], "Illegal partition for xd rope"
+
+     # Convert split sizes to split indices for MLX
+     split_indices = [
+         sum(xdrope_section[: i + 1]) for i in range(len(xdrope_section) - 1)
+     ]
+     cos_splits = mx.split(cos, split_indices, axis=-1)
+     sin_splits = mx.split(sin, split_indices, axis=-1)
+
+     cos = mx.concatenate(
+         [m[:, :, i % x_dim, :] for i, m in enumerate(cos_splits)], axis=-1
+     )
+     sin = mx.concatenate(
+         [m[:, :, i % x_dim, :] for i, m in enumerate(sin_splits)], axis=-1
+     )
+
+     # for head repeat
+     cos = cos.reshape(output_size[0], 1, output_size[2], -1)
+     sin = sin.reshape(output_size[0], 1, output_size[2], -1)
+
+     origin_dtype = q.dtype
+     q, k = q.astype(mx.float32), k.astype(mx.float32)
+     cos, sin = cos.astype(mx.float32), sin.astype(mx.float32)
+
+     q_out = (q * cos) + (rotate_half(q) * sin)
+     k_out = (k * cos) + (rotate_half(k) * sin)
+
+     return q_out.astype(origin_dtype), k_out.astype(origin_dtype)
+
+
+ def apply_rotary_pos_emb(
+     q: mx.array, k: mx.array, cos: mx.array, sin: mx.array, unsqueeze_dim: int = 1
+ ) -> Tuple[mx.array, mx.array]:
+     """Standard rotary position embedding.
+
+     Args:
+         q: Queries with shape (batch, n_heads, seq_len, head_dim)
+         k: Keys with shape (batch, n_heads, seq_len, head_dim)
+         cos: Cosine values with shape (seq_len, head_dim)
+         sin: Sine values with shape (seq_len, head_dim)
+     """
+     # Expand cos/sin to (1, 1, seq_len, head_dim) for broadcasting
+     cos = cos[None, None, :, :]
+     sin = sin[None, None, :, :]
+
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+
+     return q_embed, k_embed
+
+
+ class Attention(nn.Module):
+     def __init__(self, config: TextConfig):
+         super().__init__()
+         self.config = config
+
+         self.hidden_size = config.hidden_size
+         self.n_heads = config.num_attention_heads
+         self.n_kv_heads = config.num_key_value_heads
+         self.head_dim = config.head_dim
+         self.scale = self.head_dim**-0.5
+
+         self.q_proj = nn.Linear(
+             self.hidden_size, self.n_heads * self.head_dim, bias=config.attention_bias
+         )
+         self.k_proj = nn.Linear(
+             self.hidden_size,
+             self.n_kv_heads * self.head_dim,
+             bias=config.attention_bias,
+         )
+         self.v_proj = nn.Linear(
+             self.hidden_size,
+             self.n_kv_heads * self.head_dim,
+             bias=config.attention_bias,
+         )
+         self.o_proj = nn.Linear(
+             self.n_heads * self.head_dim,
+             config.hidden_size,
+             bias=config.attention_bias,
+         )
+
+         if config.use_qk_norm:
+             self.query_layernorm = nn.RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+             self.key_layernorm = nn.RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+         self.rotary_emb = HunyuanRotaryEmbedding(config=config)
+
+         self.xdrope_section = None
+         if config.rope_scaling is not None:
+             self.xdrope_section = config.rope_scaling.get("xdrope_section")
+
+     def __call__(
+         self,
+         x: mx.array,
+         mask: Optional[mx.array] = None,
+         cache: Optional[KVCache] = None,
+         position_ids: Optional[mx.array] = None,
+     ) -> mx.array:
+         B, L, _ = x.shape
+
+         # Project Q, K, V
+         queries = self.q_proj(x)
+         keys = self.k_proj(x)
+         values = self.v_proj(x)
+
+         # Reshape to (B, n_heads, L, head_dim)
+         queries = queries.reshape(B, L, self.n_heads, self.head_dim).transpose(
+             0, 2, 1, 3
+         )
+         keys = keys.reshape(B, L, self.n_kv_heads, self.head_dim).transpose(0, 2, 1, 3)
+         values = values.reshape(B, L, self.n_kv_heads, self.head_dim).transpose(
+             0, 2, 1, 3
+         )
+
+         kv_seq_len = L
+         offset = 0
+         if cache is not None:
+             offset = cache.offset
+             kv_seq_len += offset
+
+         cos, sin = self.rotary_emb(values, seq_len=kv_seq_len)
+
+         # Apply rotary embeddings
+         if self.xdrope_section is not None and (cache is None or offset == 0):
+             # XD RoPE for prefill (first forward pass)
+             output_size = (B, self.n_heads, L, L)
+             queries, keys = apply_rotary_pos_emb_xdrope(
+                 queries,
+                 keys,
+                 cos,
+                 sin,
+                 position_ids,
+                 self.xdrope_section,
+                 output_size,
+             )
+         else:
+             # Standard RoPE for decode (subsequent tokens)
+             if cache is not None and offset > 0:
+                 cos = cos[-L:]
+                 sin = sin[-L:]
+             queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin)
+
+         # Apply QK normalization if configured
+         if self.config.use_qk_norm:
+             queries = self.query_layernorm(queries)
+             keys = self.key_layernorm(keys)
+
+         # Update cache
+         if cache is not None:
+             keys, values = cache.update_and_fetch(keys, values)
+
+         # Apply mask
+         if mask is not None and isinstance(mask, mx.array):
+             mask = mask[..., : keys.shape[-2]]
+
+         output = scaled_dot_product_attention(
+             queries, keys, values, cache=cache, scale=self.scale, mask=mask
+         )
+
+         output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+         return self.o_proj(output)
+
+
+ class MLP(nn.Module):
+     def __init__(self, config: TextConfig):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+         self.intermediate_size = config.intermediate_size
+
+         self.gate_proj = nn.Linear(
+             self.hidden_size, self.intermediate_size, bias=config.mlp_bias
+         )
+         self.up_proj = nn.Linear(
+             self.hidden_size, self.intermediate_size, bias=config.mlp_bias
+         )
+         self.down_proj = nn.Linear(
+             self.intermediate_size, self.hidden_size, bias=config.mlp_bias
+         )
+
+     def __call__(self, x: mx.array) -> mx.array:
+         return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
+
+
+ class DecoderLayer(nn.Module):
+     def __init__(self, config: TextConfig):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+         self.self_attn = Attention(config)
+         self.mlp = MLP(config)
+         self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = nn.RMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+
+     def __call__(
+         self,
+         x: mx.array,
+         mask: Optional[mx.array] = None,
+         cache: Optional[KVCache] = None,
+         position_ids: Optional[mx.array] = None,
+     ) -> mx.array:
+         # Self-attention with residual
+         r = self.self_attn(self.input_layernorm(x), mask, cache, position_ids)
+         h = x + r
+
+         # MLP with residual
+         r = self.mlp(self.post_attention_layernorm(h))
+         out = h + r
+
+         return out
+
+
+ class HunyuanModel(nn.Module):
+     def __init__(self, config: TextConfig):
+         super().__init__()
+         self.config = config
+         self.vocab_size = config.vocab_size
+         self.num_hidden_layers = config.num_hidden_layers
+
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.layers = [DecoderLayer(config) for _ in range(config.num_hidden_layers)]
+         self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def __call__(
+         self,
+         input_ids: Optional[mx.array] = None,
+         inputs_embeds: Optional[mx.array] = None,
+         mask: Optional[mx.array] = None,
+         cache=None,
+         position_ids: Optional[mx.array] = None,
+     ) -> mx.array:
+
+         if inputs_embeds is None:
+             h = self.embed_tokens(input_ids)
+         else:
+             h = inputs_embeds
+
+         if cache is None:
+             cache = [None] * len(self.layers)
+
+         if mask is None:
+             mask = create_attention_mask(h, cache)
+
+         for layer, c in zip(self.layers, cache):
+             h = layer(h, mask, c, position_ids)
+
+         return self.norm(h)
+
+
+ class LanguageModel(nn.Module):
+     def __init__(self, config: ModelConfig = None):
+         super().__init__()
+         self.args = config.text_config
+         self.config = config
+         self.model_type = self.args.model_type
+         self.model = HunyuanModel(self.args)
+         self._position_ids = None
+
+         if not self.args.tie_word_embeddings:
+             self.lm_head = nn.Linear(
+                 self.args.hidden_size, self.args.vocab_size, bias=False
+             )
+
+     def get_xdrope_input_positions(
+         self,
+         input_tokens: List[int],
+         image_grid_thw: Optional[mx.array],
+         image_token_id: int,
+         spatial_merge_size: int,
+     ) -> mx.array:
+         """Compute XD-RoPE position IDs for image-text interleaved inputs."""
+
+         xd_num = len(self.args.rope_scaling["xdrope_section"])
+
+         input_tokens_arr = np.array(input_tokens)
+         image_start_indices = np.where(input_tokens_arr == image_token_id)[0].tolist()
+
+         seq_len = len(input_tokens)
+         p_index = np.arange(seq_len)
+         w_index = np.arange(seq_len)
+         h_index = np.arange(seq_len)
+         t_index = np.arange(seq_len)
+
+         # Process image positions if we have images
+         if image_grid_thw is not None and len(image_start_indices) > 0:
+             for image_index in range(len(image_start_indices)):
+                 # +1: start filling grid positions right after the image start token
+                 pos = int(image_start_indices[image_index]) + 1
+                 _, h, w = image_grid_thw.flatten().tolist()
+
+                 llm_grid_h = h // spatial_merge_size
+                 llm_grid_w = w // spatial_merge_size
+
+                 token_num = (llm_grid_w + 1) * llm_grid_h
+
+                 # Ensure we don't go out of bounds
+                 end_pos = min(pos + token_num, seq_len)
+                 actual_token_num = end_pos - pos
+
+                 if actual_token_num > 0:
+                     # w_index: [0, 1, ..., grid_w, 0, 1, ..., grid_w, ...] repeated for each row
+                     w_pattern = np.tile(np.arange(llm_grid_w + 1), llm_grid_h)[
+                         :actual_token_num
+                     ]
+                     w_index[pos:end_pos] = w_pattern
+
+                     # h_index: [0, 0, ..., 0, 1, 1, ..., 1, ...] each repeated (grid_w + 1) times
+                     h_pattern = np.repeat(np.arange(llm_grid_h), llm_grid_w + 1)[
+                         :actual_token_num
+                     ]
+                     h_index[pos:end_pos] = h_pattern
+
+                     # t_index: image index for temporal dimension
+                     t_index[pos:end_pos] = image_index
+
+         # Stack based on number of xdrope dimensions
+         if xd_num == 4:
+             llm_positions = mx.stack(
+                 [
+                     mx.array(p_index),
+                     mx.array(t_index),
+                     mx.array(h_index),
+                     mx.array(w_index),
+                 ]
+             )
+         elif xd_num == 3:
+             llm_positions = mx.stack(
+                 [
+                     mx.array(t_index),
+                     mx.array(h_index),
+                     mx.array(w_index),
+                 ]
+             )
+         else:
+             # Fallback: just use sequential positions
+             llm_positions = mx.stack([mx.array(p_index)] * xd_num)
+
+         return llm_positions
+
+     def __call__(
+         self,
+         inputs: Optional[mx.array] = None,
+         inputs_embeds: Optional[mx.array] = None,
+         mask: Optional[mx.array] = None,
+         cache=None,
+         **kwargs,
+     ) -> LanguageModelOutput:
+
+         kwargs_position_ids = kwargs.pop("position_ids", None)
+
+         # Compute cache offset
+         cache_offset = 0
+         if cache is not None and cache[0] is not None:
+             offset = cache[0].offset
+             if isinstance(offset, int):
+                 cache_offset = offset
+             elif isinstance(offset, mx.array):
+                 cache_offset = (offset if offset.ndim == 0 else offset[0]).item()
+             else:
+                 cache_offset = int(offset)
+
+         # Determine sequence length from inputs or inputs_embeds
+         if inputs_embeds is not None:
+             seq_length = inputs_embeds.shape[1]
+         elif inputs is not None:
+             seq_length = inputs.shape[1]
+         else:
+             seq_length = 0
+
+         position_ids = None
+         if cache is None or cache_offset == 0:
+             # Prefill phase - need xdrope position_ids
+             if self._position_ids is not None:
+                 # Use stored position_ids (sliced for chunked prefill)
+                 position_ids = self._position_ids[
+                     :, :, cache_offset : cache_offset + seq_length
+                 ]
+             elif kwargs_position_ids is not None:
+                 # Use position_ids from kwargs (e.g., from processor)
+                 if not isinstance(kwargs_position_ids, mx.array):
+                     kwargs_position_ids = mx.array(kwargs_position_ids)
+                 # Store for potential future chunks and slice for current chunk
+                 self._position_ids = kwargs_position_ids
+                 position_ids = self._position_ids[
+                     :, :, cache_offset : cache_offset + seq_length
+                 ]
+             elif inputs is not None:
+                 # Compute position_ids on the fly (for non-chunked prefill)
+                 position_ids = self.get_xdrope_input_positions(
+                     input_tokens=inputs[0].tolist(),
+                     image_grid_thw=kwargs.get("image_grid_thw", None),
+                     image_token_id=self.config.image_token_id,
+                     spatial_merge_size=self.config.vision_config.spatial_merge_size,
+                 )[None, ...]
+                 # Store for potential future chunks
+                 self._position_ids = position_ids
+
+         out = self.model(
+             input_ids=inputs,
+             inputs_embeds=inputs_embeds,
+             mask=mask,
+             cache=cache,
+             position_ids=position_ids,
+         )
+
+         if self.args.tie_word_embeddings:
+             logits = self.model.embed_tokens.as_linear(out)
+         else:
+             logits = self.lm_head(out)
+
+         return LanguageModelOutput(logits=logits)
+
+     @property
+     def layers(self):
+         return self.model.layers
+
+     @property
+     def head_dim(self):
+         return self.args.head_dim
+
+     @property
+     def n_kv_heads(self):
+         return self.args.num_key_value_heads
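
For intuition on the positions built by get_xdrope_input_positions above: every token carries one index per RoPE axis (sequence, image/temporal, row, column); text tokens keep sequential values on all axes, while image tokens get grid coordinates on the t/h/w axes. A tiny standalone NumPy sketch with made-up sizes (4 text tokens followed by one 2x3 image grid plus a per-row separator column):

import numpy as np

# Toy illustration of the XD-RoPE index layout; grid sizes and token counts are made up.
seq_len = 12                      # 4 text tokens + 8 image tokens (2 rows x (3 cols + 1 separator))
llm_grid_h, llm_grid_w = 2, 3
pos = 4                           # image tokens start right after the 4 text tokens

p_index = np.arange(seq_len)      # plain sequential positions
t_index = np.arange(seq_len)
h_index = np.arange(seq_len)
w_index = np.arange(seq_len)

token_num = (llm_grid_w + 1) * llm_grid_h
end = min(pos + token_num, seq_len)
n = end - pos

# Column index cycles 0..grid_w (plus separator) within each row; row index repeats per row.
w_index[pos:end] = np.tile(np.arange(llm_grid_w + 1), llm_grid_h)[:n]
h_index[pos:end] = np.repeat(np.arange(llm_grid_h), llm_grid_w + 1)[:n]
t_index[pos:end] = 0              # single image -> temporal index 0

# Rows: p, t, h, w -- this 4xL array matches the xd_num == 4 stacking above.
print(np.stack([p_index, t_index, h_index, w_index]))
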