fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/utils.py ADDED
@@ -0,0 +1,1339 @@
1
+ import glob
2
+ import importlib
3
+ import inspect
4
+ import json
5
+ import logging
6
+ from io import BytesIO
7
+ from pathlib import Path
8
+ from textwrap import dedent
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import mlx.core as mx
12
+ import mlx.nn as nn
13
+ import numpy as np
14
+ import requests
15
+ import soundfile as sf
16
+ from huggingface_hub import snapshot_download
17
+ from mlx.utils import tree_flatten, tree_map
18
+ from PIL import Image, ImageOps
19
+ from transformers import AutoProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast
20
+
21
+ from .models.base import BaseImageProcessor
22
+ from .tokenizer_utils import load_tokenizer
23
+ from .trainer import apply_lora_layers
24
+
25
+ # Constants
26
+ MODEL_REMAPPING = {
27
+ "llava_qwen2": "fastvlm", # Apple's FastVLM, note it's different to the one below
28
+ "llava-qwen2": "llava_bunny",
29
+ "bunny-llama": "llava_bunny",
30
+ "lfm2-vl": "lfm2_vl",
31
+ "cohere2_vision": "aya_vision",
32
+ "jvlm": "jina_vlm",
33
+ "moondream1": "moondream2",
34
+ }
35
+
36
+ MAX_FILE_SIZE_GB = 5
37
+
38
+ MODEL_CONVERSION_DTYPES = ["float16", "bfloat16", "float32"]
39
+
40
+
41
+ def skip_multimodal_module(path: str) -> bool:
42
+ """
43
+ Check if a multimodal module (vision/audio) should skip quantization.
44
+
45
+ Args:
46
+ path: The module path to check
47
+
48
+ Returns:
49
+ bool: True if the module is multimodal and should skip quantization, False otherwise
50
+ """
51
+ return (
52
+ "vision_model" in path
53
+ or "vision_tower" in path
54
+ or "vl_connector" in path
55
+ or "sam_model" in path
56
+ or "audio_model" in path
57
+ or "audio_tower" in path
58
+ or "code_predictor" in path
59
+ )
60
+
61
+
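For reference, a minimal sketch of how this predicate behaves on typical parameter paths (the example paths are illustrative, not taken from a specific checkpoint):

```python
from mlx_vlm.utils import skip_multimodal_module

# Vision/audio submodules are flagged so quantization can skip them.
print(skip_multimodal_module("vision_tower.blocks.0.attn.qkv"))    # True
print(skip_multimodal_module("audio_tower.conv1"))                 # True
# Plain language-model weights are not flagged.
print(skip_multimodal_module("language_model.layers.0.mlp.gate"))  # False
```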
62
+ def get_model_and_args(config: dict):
63
+ """
64
+ Retrieve the model object based on the configuration.
65
+
66
+ Args:
67
+ config (dict): The model configuration.
68
+
69
+ Returns:
70
+ A tuple containing the Model class and the ModelArgs class.
71
+ """
72
+ model_type = config["model_type"].lower()
73
+
74
+ model_type = MODEL_REMAPPING.get(model_type, model_type)
75
+
76
+ try:
77
+ arch = importlib.import_module(f"mlx_vlm.models.{model_type}")
78
+ except ImportError as e:
79
+ msg = f"Model type {model_type} not supported. Error: {e}"
80
+ logging.error(msg)
81
+ raise ValueError(msg)
82
+
83
+ return arch, model_type
84
+
85
+
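A small sketch of the remapping step: the config's `model_type` is lowercased, routed through `MODEL_REMAPPING`, and the matching `mlx_vlm.models` submodule is imported. The config dicts below are illustrative, and the example assumes the package and its dependencies (mlx, etc.) are installed.

```python
from mlx_vlm.utils import get_model_and_args

# "llava_qwen2" (Apple's FastVLM) is remapped to the fastvlm implementation.
arch, model_type = get_model_and_args({"model_type": "llava_qwen2"})
print(model_type)  # fastvlm

# An unknown model type surfaces as a ValueError.
try:
    get_model_and_args({"model_type": "not_a_real_model"})
except ValueError as e:
    print(e)
```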
86
+ def get_model_path(
87
+ path_or_hf_repo: str, revision: Optional[str] = None, force_download: bool = False
88
+ ) -> Path:
89
+ """
90
+ Ensures the model is available locally. If the path does not exist locally,
91
+ it is downloaded from the Hugging Face Hub.
92
+
93
+ Args:
94
+ path_or_hf_repo (str): The local path or Hugging Face repository ID of the model.
95
+ revision (str, optional): A revision id which can be a branch name, a tag, or a commit hash.
96
+
97
+ Returns:
98
+ Path: The path to the model.
99
+ """
100
+ model_path = Path(path_or_hf_repo)
101
+ if not model_path.exists():
102
+ model_path = Path(
103
+ snapshot_download(
104
+ repo_id=path_or_hf_repo,
105
+ revision=revision,
106
+ allow_patterns=[
107
+ "*.json",
108
+ "*.safetensors",
109
+ "*.py",
110
+ "*.model",
111
+ "*.tiktoken",
112
+ "*.txt",
113
+ "*.jinja",
114
+ ],
115
+ force_download=force_download,
116
+ )
117
+ )
118
+ return model_path
119
+
120
+
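A hedged usage sketch: a path that already exists locally is returned unchanged, while anything else is treated as a Hugging Face repo id and resolved with `snapshot_download` (which needs network access and a valid repo).

```python
import tempfile
from pathlib import Path

from mlx_vlm.utils import get_model_path

# An existing directory is returned as-is, with no Hub access.
with tempfile.TemporaryDirectory() as tmp:
    resolved = get_model_path(tmp)
    assert resolved == Path(tmp)

# A non-existent path would instead be downloaded from the Hub
# (configs, safetensors shards, tokenizer files) into the HF cache.
```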
121
+ def load_model(model_path: Path, lazy: bool = False, **kwargs) -> nn.Module:
122
+ """
123
+ Load and initialize the model from a given path.
124
+
125
+ Args:
126
+ model_path (Path): The path to load the model from.
127
+ lazy (bool): If False eval the model parameters to make sure they are
128
+ loaded in memory before returning, otherwise they will be loaded
129
+ when needed. Default: ``False``
130
+ revision (str, optional): A revision id which can be a branch name,
131
+ a tag, or a commit hash. Default: ``None``.
132
+
133
+ Returns:
134
+ nn.Module: The loaded and initialized model.
135
+
136
+ Raises:
137
+ FileNotFoundError: If the weight files (.safetensors) are not found.
138
+ ValueError: If the model class or args class are not found or cannot be instantiated.
139
+ """
140
+ config = load_config(model_path, **kwargs)
141
+ quantization = config.get("quantization", None)
142
+
143
+ # Find all .safetensors files in the model_path, excluding consolidated model weights
144
+ weight_files = [
145
+ wf
146
+ for wf in glob.glob(str(model_path / "*.safetensors"))
147
+ if not wf.endswith("consolidated.safetensors")
148
+ ]
149
+
150
+ if not weight_files:
151
+ logging.error(f"No safetensors found in {model_path}")
152
+ message = f"""
153
+ No safetensors found in {model_path}
154
+ Create safetensors using the following code:
155
+ ```
156
+ from transformers import AutoModelForCausalLM, AutoProcessor
157
+
158
+ model_id= "<huggingface_model_id>"
159
+ model = AutoModelForCausalLM.from_pretrained(model_id)
160
+ processor = AutoProcessor.from_pretrained(model_id)
161
+
162
+ model.save_pretrained("<local_dir>")
163
+ processor.save_pretrained("<local_dir>")
164
+ ```
165
+ Then use the <local_dir> as the --hf-path in the convert script.
166
+ ```
167
+ python -m mlx_vlm.convert --hf-path <local_dir> --mlx-path <mlx_dir>
168
+ ```
169
+ """
170
+ raise FileNotFoundError(message)
171
+
172
+ weights = {}
173
+ for wf in weight_files:
174
+ weights.update(mx.load(wf))
175
+
176
+ import safetensors
177
+
178
+ with safetensors.safe_open(weight_files[0], framework="np") as f:
179
+ is_mlx_format = f.metadata() and f.metadata().get("format") == "mlx"
180
+
181
+ model_class, _ = get_model_and_args(config=config)
182
+
183
+ # Initialize text and vision configs if not present
184
+ config.setdefault("text_config", {})
185
+ config.setdefault("vision_config", {})
186
+ config.setdefault("audio_config", {})
187
+
188
+ # Initialize model config and update it with module configs
189
+ model_config = model_class.ModelConfig.from_dict(config)
190
+ modules = ["text", "vision", "perceiver", "projector", "audio"]
191
+ model_config = update_module_configs(model_config, model_class, config, modules)
192
+
193
+ model = model_class.Model(model_config)
194
+
195
+ # #region agent log
196
+ import json
197
+ log_file = "/Users/zekieldee/Desktop/code/mlx-vlm/.cursor/debug.log"
198
+ def log_weights(location, message, data, hypothesis_id):
199
+ try:
200
+ with open(log_file, "a") as f:
201
+ f.write(json.dumps({"sessionId": "debug-session", "runId": "load_model", "hypothesisId": hypothesis_id, "location": location, "message": message, "data": data, "timestamp": __import__("time").time_ns() // 1000000}) + "\n")
202
+ except: pass
203
+
204
+ # Get all model parameter keys
205
+ model_params = dict(tree_flatten(model.parameters()))
206
+ model_param_keys = sorted(model_params.keys())
207
+ log_weights("utils.py:load_model_params", "Model parameter keys before load_weights", {"n_params": len(model_param_keys), "sample_keys": model_param_keys[:10], "all_keys": model_param_keys}, "H3")
208
+ # #endregion
209
+
210
+ if not is_mlx_format:
211
+ # #region agent log
212
+ pre_sanitize_keys = sorted(weights.keys())
213
+ log_weights("utils.py:pre_sanitize", "Weights before sanitize", {"n_weights": len(weights), "sample_keys": pre_sanitize_keys[:10], "all_keys": pre_sanitize_keys}, "H1")
214
+ # #endregion
215
+
216
+ # Sanitize weights
217
+ weights = sanitize_weights(model, weights)
218
+
219
+ if hasattr(model, "thinker") and hasattr(model.thinker, "sanitize"):
220
+ weights = sanitize_weights(model.thinker, weights)
221
+ weights = sanitize_weights(model.thinker.vision_tower, weights)
222
+ weights = sanitize_weights(model.thinker.audio_tower, weights)
223
+ weights = sanitize_weights(model.thinker.language_model, weights)
224
+ weights = sanitize_weights(model.code2wav, weights)
225
+ weights = sanitize_weights(model.talker, weights)
226
+ else:
227
+ weights = sanitize_weights(
228
+ model_class.VisionModel, weights, model_config.vision_config
229
+ )
230
+ weights = sanitize_weights(
231
+ model_class.LanguageModel, weights, model_config.text_config
232
+ )
233
+ if hasattr(model_class, "AudioModel"):
234
+ weights = sanitize_weights(
235
+ model_class.AudioModel, weights, model_config.audio_config
236
+ )
237
+
238
+ # #region agent log
239
+ post_sanitize_keys = sorted(weights.keys())
240
+ log_weights("utils.py:post_sanitize", "Weights after sanitize", {"n_weights": len(weights), "sample_keys": post_sanitize_keys[:10], "all_keys": post_sanitize_keys}, "H1")
241
+ # #endregion
242
+
243
+ if (quantization := config.get("quantization", None)) is not None:
244
+ # Handle legacy models which may or may not have vision quantized
245
+ # TODO: Re-upload the models with the new quantization config and remove this
246
+ skip_vision = config.get("vision_config", {}).get("skip_vision", False)
247
+
248
+ def get_class_predicate(p, m):
249
+ # Skip vision and audio modules only when the legacy skip_vision flag is set
250
+ if skip_multimodal_module(p) and skip_vision:
251
+ return False
252
+ # Handle custom per layer quantizations
253
+ if p in config["quantization"]:
254
+ return config["quantization"][p]
255
+ if not hasattr(m, "to_quantized"):
256
+ return False
257
+ # Skip layers not divisible by 64
258
+ if hasattr(m, "weight") and m.weight.size % 64 != 0:
259
+ return False
260
+ # Handle legacy models which may not have everything quantized
261
+ return f"{p}.scales" in weights
262
+
263
+ nn.quantize(
264
+ model,
265
+ group_size=quantization["group_size"],
266
+ bits=quantization["bits"],
267
+ mode=quantization.get("mode", "affine"),
268
+ class_predicate=get_class_predicate,
269
+ )
270
+
271
+ # #region agent log
272
+ weights_to_load = sorted([k for k, v in weights.items()])
273
+ log_weights("utils.py:before_load_weights", "Weights being passed to load_weights", {"n_weights": len(weights_to_load), "sample_keys": weights_to_load[:10], "all_keys": weights_to_load}, "H2")
274
+ # #endregion
275
+
276
+ model.load_weights(list(weights.items()))
277
+
278
+ # #region agent log
279
+ # Get model parameters after load_weights to see what was actually loaded
280
+ loaded_params = dict(tree_flatten(model.parameters()))
281
+ loaded_param_keys = sorted(loaded_params.keys())
282
+
283
+ # Find which weights from sanitize matched model params
284
+ matched_keys = [k for k in weights_to_load if k in loaded_param_keys]
285
+ unmatched_weights = [k for k in weights_to_load if k not in loaded_param_keys]
286
+ unmatched_params = [k for k in loaded_param_keys if k not in weights_to_load]
287
+
288
+ # Categorize by subsystem
289
+ vision_weights = [k for k in matched_keys if k.startswith("vision_")]
290
+ projection_weights = [k for k in matched_keys if k.startswith("vision_projection")]
291
+ language_weights = [k for k in matched_keys if k.startswith("language_model")]
292
+
293
+ log_weights("utils.py:after_load_weights", "Weight loading results", {
294
+ "total_model_params": len(loaded_param_keys),
295
+ "total_sanitized_weights": len(weights_to_load),
296
+ "matched_weights": len(matched_keys),
297
+ "unmatched_weights_from_sanitize": len(unmatched_weights),
298
+ "unmatched_params_in_model": len(unmatched_params),
299
+ "vision_weights_loaded": len(vision_weights),
300
+ "projection_weights_loaded": len(projection_weights),
301
+ "language_weights_loaded": len(language_weights),
302
+ "matched_keys": matched_keys,
303
+ "unmatched_weights": unmatched_weights,
304
+ "unmatched_params": unmatched_params
305
+ }, "H2,H3,H4")
306
+ # #endregion
307
+
308
+ if not lazy:
309
+ mx.eval(model.parameters())
310
+
311
+ model.eval()
312
+ return model
313
+
314
+
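A minimal sketch of calling `load_model` directly, assuming an already-converted MLX checkpoint exists on disk; the path below is a placeholder, and most callers go through the higher-level `load()` further down instead.

```python
from mlx_vlm.utils import get_model_path, load_model

# Placeholder: a directory containing config.json and *.safetensors shards
# produced by mlx_vlm.convert.
model_dir = get_model_path("path/to/converted-mlx-model")

# lazy=False (the default) evaluates all parameters up front;
# lazy=True defers materialization until the weights are first used.
model = load_model(model_dir, lazy=False)
print(type(model).__name__)
```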
315
+ def sanitize_weights(model_obj, weights, config=None):
316
+ """Helper function to sanitize weights if the model has a sanitize method"""
317
+ if hasattr(model_obj, "sanitize"):
318
+ if config is not None:
319
+ model_obj = model_obj(config)
320
+ weights = model_obj.sanitize(weights)
321
+ return weights
322
+
323
+
324
+ def update_module_configs(model_config, model_class, config, modules):
325
+ """Updates configuration for model modules like text and vision modules.
326
+
327
+ Args:
328
+ model_config: The model configuration object that will be updated
329
+ model_class: The model class containing component config classes
330
+ config: Dictionary containing configuration parameters
331
+ modules: List of module names to update configs for (e.g. ["text", "vision"])
332
+
333
+ Returns:
334
+ The updated model_config object
335
+ """
336
+ for config_name in modules:
337
+ config_attr = f"{config_name}_config"
338
+ if hasattr(model_config, config_attr):
339
+ config_class = getattr(model_class, f"{config_name.title()}Config")
340
+ setattr(
341
+ model_config, config_attr, config_class.from_dict(config[config_attr])
342
+ )
343
+ return model_config
344
+
345
+
346
+ def load(
347
+ path_or_hf_repo: str,
348
+ adapter_path: Optional[str] = None,
349
+ lazy: bool = False,
350
+ revision: Optional[str] = None,
351
+ **kwargs,
352
+ ) -> Tuple[nn.Module, Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]:
353
+ """
354
+ Load the model and tokenizer from a given path or a huggingface repository.
355
+
356
+ Args:
357
+ path_or_hf_repo (Path): The path or the huggingface repository to load the model from.
358
+ tokenizer_config (dict, optional): Configuration parameters specifically for the tokenizer.
359
+ Defaults to an empty dictionary.
360
+ adapter_path (str, optional): Path to the LoRA adapters. If provided, applies LoRA layers
361
+ to the model. Default: ``None``.
362
+ lazy (bool): If False eval the model parameters to make sure they are
363
+ loaded in memory before returning, otherwise they will be loaded
364
+ when needed. Default: ``False``
365
+ revision (str, optional): A revision id which can be a branch name,
366
+ a tag, or a commit hash. Default: ``None``.
367
+ Returns:
368
+ Tuple[nn.Module, TokenizerWrapper]: A tuple containing the loaded model and tokenizer.
369
+
370
+ Raises:
371
+ FileNotFoundError: If config file or safetensors are not found.
372
+ ValueError: If model class or args class are not found.
373
+ """
374
+ force_download = kwargs.get("force_download", False)
375
+ model_path = get_model_path(
376
+ path_or_hf_repo, force_download=force_download, revision=revision
377
+ )
378
+ model = load_model(model_path, lazy, **kwargs)
379
+ if adapter_path is not None:
380
+ model = apply_lora_layers(model, adapter_path)
381
+ model.eval()
382
+
383
+ image_processor = load_image_processor(model_path, **kwargs)
384
+
385
+ # Get the eos_token_id from the model config
386
+ eos_token_id = getattr(model.config, "eos_token_id", None)
387
+
388
+ processor = load_processor(model_path, True, eos_token_ids=eos_token_id, **kwargs)
389
+
390
+ if image_processor is not None:
391
+ processor.image_processor = image_processor
392
+
393
+ return model, processor
394
+
395
+
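The typical end-to-end entry point is `load()`, which resolves the path, loads the weights, optionally applies LoRA adapters, and returns the model together with its processor. A hedged sketch; the repo id is a placeholder for any MLX-converted vision-language model:

```python
from mlx_vlm.utils import load

# Placeholder repo id; substitute an MLX-converted VLM checkpoint.
model, processor = load("some-org/some-vlm-mlx-4bit")

# LoRA adapters trained with mlx_vlm.lora can be applied at load time:
# model, processor = load("some-org/some-vlm-mlx-4bit", adapter_path="adapters/")
```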
396
+ def load_config(model_path: Union[str, Path], **kwargs) -> dict:
397
+ """Load model configuration from a path or Hugging Face repo.
398
+
399
+ Args:
400
+ model_path: Local path or Hugging Face repo ID to load config from
401
+ **kwargs: Additional keyword arguments to pass to the config loader
402
+
403
+ Returns:
404
+ dict: Model configuration
405
+
406
+ Raises:
407
+ FileNotFoundError: If config.json is not found at the path
408
+ """
409
+ if isinstance(model_path, str):
410
+ model_path = get_model_path(model_path)
411
+
412
+ try:
413
+ with open(model_path / "config.json", encoding="utf-8") as f:
414
+ config = json.load(f)
415
+
416
+ generation_config_file = model_path / "generation_config.json"
417
+ if generation_config_file.exists():
418
+ generation_config = {}
419
+ try:
420
+ with open(generation_config_file, "r") as f:
421
+ generation_config = json.load(f)
422
+ except json.JSONDecodeError:
423
+ pass
424
+
425
+ if eos_token_id := generation_config.get("eos_token_id", False):
426
+ config["eos_token_id"] = eos_token_id
427
+
428
+ return config
429
+
430
+ except FileNotFoundError as exc:
431
+ raise FileNotFoundError(f"Config not found at {model_path}") from exc
432
+
433
+
434
+ def load_image_processor(model_path: Union[str, Path], **kwargs) -> BaseImageProcessor:
435
+ if isinstance(model_path, str):
436
+ model_path = get_model_path(model_path)
437
+
438
+ if not kwargs:
439
+ config = load_config(model_path, trust_remote_code=True)
440
+ else:
441
+ config = load_config(model_path, **kwargs)
442
+
443
+ model_class, _ = get_model_and_args(config)
444
+ image_processor = None
445
+
446
+ if hasattr(model_class, "ImageProcessor"):
447
+ init_signature = inspect.signature(model_class.ImageProcessor.__init__)
448
+
449
+ if "config" in init_signature.parameters:
450
+ image_processor = model_class.ImageProcessor(config=config)
451
+ else:
452
+ image_processor = model_class.ImageProcessor()
453
+
454
+ return image_processor
455
+
456
+
457
+ def load_processor(
458
+ model_path, add_detokenizer=True, eos_token_ids=None, **kwargs
459
+ ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
460
+
461
+ processor = AutoProcessor.from_pretrained(model_path, use_fast=True, **kwargs)
462
+ if add_detokenizer:
463
+ detokenizer_class = load_tokenizer(model_path, return_tokenizer=False)
464
+
465
+ # Get the tokenizer object
466
+ tokenizer_obj = (
467
+ processor.tokenizer if hasattr(processor, "tokenizer") else processor
468
+ )
469
+
470
+ # Instantiate the detokenizer
471
+ processor.detokenizer = detokenizer_class(tokenizer_obj)
472
+
473
+ # Determine the EOS token IDs, prioritizing the function argument
474
+ final_eos_token_ids = (
475
+ eos_token_ids if eos_token_ids is not None else tokenizer_obj.eos_token_ids
476
+ )
477
+
478
+ # Create and assign the StoppingCriteria
479
+ criteria = StoppingCriteria(final_eos_token_ids, tokenizer_obj)
480
+ if hasattr(processor, "tokenizer"):
481
+ processor.tokenizer.stopping_criteria = criteria
482
+ else:
483
+ processor.stopping_criteria = criteria
484
+
485
+ return processor
486
+
487
+
488
+ def fetch_from_hub(
489
+ model_path: Path, lazy: bool = False, **kwargs
490
+ ) -> Tuple[nn.Module, dict, PreTrainedTokenizer]:
491
+ model = load_model(model_path, lazy, **kwargs)
492
+ config = load_config(model_path, **kwargs)
493
+ processor = load_processor(
494
+ model_path,
495
+ add_detokenizer=False,
496
+ eos_token_ids=config.get("eos_token_id", None),
497
+ **kwargs,
498
+ )
499
+ return model, config, processor
500
+
501
+
502
+ def make_shards(weights: dict, max_file_size_gb: int = MAX_FILE_SIZE_GB) -> list:
503
+ """
504
+ Splits the weights into smaller shards.
505
+
506
+ Args:
507
+ weights (dict): Model weights.
508
+ max_file_size_gb (int): Maximum size of each shard in gigabytes.
509
+
510
+ Returns:
511
+ list: List of weight shards.
512
+ """
513
+ max_file_size_bytes = max_file_size_gb << 30
514
+ shards = []
515
+ shard, shard_size = {}, 0
516
+ for k, v in weights.items():
517
+ if shard_size + v.nbytes > max_file_size_bytes:
518
+ shards.append(shard)
519
+ shard, shard_size = {}, 0
520
+ shard[k] = v
521
+ shard_size += v.nbytes
522
+ shards.append(shard)
523
+ return shards
524
+
525
+
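`make_shards` greedily packs tensors into shards of at most `max_file_size_gb` (note the limit is `max_file_size_gb << 30` bytes, i.e. GiB). A small worked example with the default limit and hypothetical weight names:

```python
import numpy as np

from mlx_vlm.utils import MAX_FILE_SIZE_GB, make_shards

# Size bookkeeping: the default limit is 5 GiB per shard.
limit_bytes = MAX_FILE_SIZE_GB << 30
per_tensor = 4096 * 4096 * 2           # one float16 (4096, 4096) matrix = 32 MiB
print(limit_bytes // per_tensor)       # 160 such tensors fit in one shard

# With small arrays everything lands in a single shard.
weights = {f"layer{i}.weight": np.zeros((8, 8), dtype=np.float16) for i in range(3)}
print(len(make_shards(weights)))       # 1
```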
526
+ def upload_to_hub(path: str, upload_repo: str, hf_path: str):
527
+ """
528
+ Uploads the model to Hugging Face hub.
529
+
530
+ Args:
531
+ path (str): Local path to the model.
532
+ upload_repo (str): Name of the HF repo to upload to.
533
+ hf_path (str): Path to the original Hugging Face model.
534
+ """
535
+ import os
536
+
537
+ from huggingface_hub import HfApi, ModelCard, logging
538
+
539
+ from . import __version__
540
+
541
+ card = ModelCard.load(hf_path)
542
+ card.data.tags = ["mlx"] if card.data.tags is None else card.data.tags + ["mlx"]
543
+ card.text = dedent(
544
+ f"""
545
+ # {upload_repo}
546
+ This model was converted to MLX format from [`{hf_path}`]() using mlx-vlm version **{__version__}**.
547
+ Refer to the [original model card](https://huggingface.co/{hf_path}) for more details on the model.
548
+ ## Use with mlx
549
+
550
+ ```bash
551
+ pip install -U mlx-vlm
552
+ ```
553
+
554
+ ```bash
555
+ python -m mlx_vlm.generate --model {upload_repo} --max-tokens 100 --temperature 0.0 --prompt "Describe this image." --image <path_to_image>
556
+ ```
557
+ """
558
+ )
559
+ card.save(os.path.join(path, "README.md"))
560
+
561
+ logging.set_verbosity_info()
562
+
563
+ api = HfApi()
564
+ api.create_repo(repo_id=upload_repo, exist_ok=True)
565
+ api.upload_folder(
566
+ folder_path=path,
567
+ repo_id=upload_repo,
568
+ repo_type="model",
569
+ )
570
+ print(f"Upload successful, go to https://huggingface.co/{upload_repo} for details.")
571
+
572
+
573
+ def apply_repetition_penalty(logits: mx.array, generated_tokens: Any, penalty: float):
574
+ """
575
+ Apply repetition penalty to specific logits based on the given context.
576
+
577
+ Paper: https://arxiv.org/abs/1909.05858
578
+
579
+ Args:
580
+ logits (mx.array): The logits produced by the language model.
581
+ generated_tokens (any): A list of N previous tokens.
582
+ penalty (float): The repetition penalty factor to be applied.
583
+
584
+ Returns:
585
+ logits (mx.array): Logits with repetition penalty applied to generated tokens.
586
+ """
587
+ if len(generated_tokens) > 0:
588
+ indices = mx.array([token for token in generated_tokens])
589
+ selected_logits = logits[:, indices]
590
+ selected_logits = mx.where(
591
+ selected_logits < 0, selected_logits * penalty, selected_logits / penalty
592
+ )
593
+ logits[:, indices] = selected_logits
594
+ return logits
595
+
596
+
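A worked example of the penalty from the CTRL paper referenced above: logits of previously generated tokens are divided by the penalty when positive and multiplied by it when negative, pushing repeated tokens down. Token ids and values here are arbitrary.

```python
import mlx.core as mx

from mlx_vlm.utils import apply_repetition_penalty

logits = mx.array([[2.0, -1.0, 0.5, 3.0]])
generated_tokens = [0, 1]  # tokens already emitted
out = apply_repetition_penalty(logits, generated_tokens, penalty=2.0)
print(out)  # [[1.0, -2.0, 0.5, 3.0]] -> token 0 halved, token 1 pushed further down
```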
597
+ def save_weights(
598
+ save_path: Union[str, Path],
599
+ model: nn.Module,
600
+ *,
601
+ donate_weights: bool = False,
602
+ ) -> None:
603
+ """Save model weights into specified directory."""
604
+ if isinstance(save_path, str):
605
+ save_path = Path(save_path)
606
+
607
+ weights = dict(tree_flatten(model.parameters()))
608
+
609
+ save_path.mkdir(parents=True, exist_ok=True)
610
+
611
+ shards = make_shards(weights)
612
+ shards_count = len(shards)
613
+ shard_file_format = (
614
+ "model-{:05d}-of-{:05d}.safetensors"
615
+ if shards_count > 1
616
+ else "model.safetensors"
617
+ )
618
+
619
+ total_size = sum(v.nbytes for v in weights.values())
620
+ index_data = {"metadata": {"total_size": total_size}, "weight_map": {}}
621
+
622
+ # Write the weights and make sure no references are kept other than the
623
+ # necessary ones
624
+ if donate_weights:
625
+ model.update(tree_map(lambda _: mx.array([]), model.parameters()))
626
+
627
+ weights.clear()
628
+ del weights
629
+
630
+ for i in range(len(shards)):
631
+ shard = shards[i]
632
+ shards[i] = None
633
+ shard_name = shard_file_format.format(i + 1, shards_count)
634
+ shard_path = save_path / shard_name
635
+
636
+ mx.save_safetensors(str(shard_path), shard, metadata={"format": "mlx"})
637
+
638
+ for weight_name in shard.keys():
639
+ index_data["weight_map"][weight_name] = shard_name
640
+ del shard
641
+
642
+ index_data["weight_map"] = {
643
+ k: index_data["weight_map"][k] for k in sorted(index_data["weight_map"])
644
+ }
645
+
646
+ with open(save_path / "model.safetensors.index.json", "w") as f:
647
+ json.dump(
648
+ index_data,
649
+ f,
650
+ indent=4,
651
+ )
652
+
653
+
654
+ def save_config(
655
+ config: dict,
656
+ config_path: Union[str, Path],
657
+ ) -> None:
658
+ """Save the model configuration to the ``config_path``.
659
+
660
+ The final configuration will be sorted before saving for better readability.
661
+
662
+ Args:
663
+ config (dict): The model configuration.
664
+ config_path (Union[str, Path]): Model configuration file path.
665
+ """
666
+ # Clean unused keys
667
+ config.pop("_name_or_path", None)
668
+ config.pop("torch_dtype", None)
669
+
670
+ # sort the config for better readability
671
+ config = dict(sorted(config.items()))
672
+
673
+ # write the updated config to the config_path (if provided)
674
+ with open(config_path, "w") as fid:
675
+ json.dump(config, fid, indent=4)
676
+
677
+
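A minimal round-trip sketch of the two save helpers on a toy module; the tiny `nn.Linear` stands in for a real VLM, and the file names follow the sharding scheme above:

```python
import tempfile
from pathlib import Path

import mlx.nn as nn

from mlx_vlm.utils import save_config, save_weights

model = nn.Linear(4, 4)  # stand-in for a converted model
with tempfile.TemporaryDirectory() as tmp:
    out = Path(tmp)
    save_weights(out, model)  # writes model.safetensors + model.safetensors.index.json
    save_config({"model_type": "demo", "hidden_size": 4}, out / "config.json")
    print(sorted(p.name for p in out.iterdir()))
```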
678
+ def load_image(image_source: Union[str, Path, BytesIO], timeout: int = 10):
679
+ """
680
+ Helper function to load an image from either a URL or file.
681
+ """
682
+ if (
683
+ isinstance(image_source, BytesIO)
684
+ or (isinstance(image_source, str) and image_source.startswith("data:image/"))
685
+ or Path(image_source).is_file()
686
+ ):
687
+ # for base64 encoded images
688
+ try:
689
+ if isinstance(image_source, str) and image_source.startswith("data:image/"):
690
+ import base64
691
+
692
+ if "," not in image_source:
693
+ raise ValueError(
694
+ "Invalid data URI format - missing comma separator"
695
+ )
696
+
697
+ _, data = image_source.split(",", 1)
698
+ image_source = BytesIO(base64.b64decode(data))
699
+
700
+ image = Image.open(image_source)
701
+ except IOError as e:
702
+ raise ValueError(
703
+ f"Failed to load image from {image_source} with error: {e}"
704
+ ) from e
705
+ elif image_source.startswith(("http://", "https://")):
706
+ try:
707
+ response = requests.get(image_source, stream=True, timeout=timeout)
708
+ response.raise_for_status()
709
+ image = Image.open(response.raw)
710
+ except Exception as e:
711
+ raise ValueError(
712
+ f"Failed to load image from URL: {image_source} with error {e}"
713
+ ) from e
714
+ else:
715
+ raise ValueError(
716
+ f"The image {image_source} must be a valid URL or existing file."
717
+ )
718
+
719
+ image = ImageOps.exif_transpose(image)
720
+ image = image.convert("RGB")
721
+ return image
722
+
723
+
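`load_image` accepts a file path, an http(s) URL, a `data:image/...;base64` URI, or a `BytesIO` object, applies EXIF orientation, and returns an RGB PIL image. A self-contained sketch using a throwaway file:

```python
import tempfile
from pathlib import Path

from PIL import Image

from mlx_vlm.utils import load_image

# Write a tiny PNG to disk and load it back through the helper.
with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "pixel.png"
    Image.new("RGB", (32, 16), (200, 30, 30)).save(path)
    img = load_image(str(path))
    print(img.size, img.mode)  # (32, 16) RGB

# URLs (network required) and data URIs go through the same function.
```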
724
+ def resize_image(img, max_size):
725
+
726
+ ratio = min(max_size[0] / img.width, max_size[1] / img.height)
727
+ new_size = (int(img.width * ratio), int(img.height * ratio))
728
+ return img.resize(new_size)
729
+
730
+
731
+ def process_image(img, resize_shape, image_processor):
732
+ if isinstance(img, str):
733
+ img = load_image(img)
734
+ if resize_shape is not None and not isinstance(image_processor, BaseImageProcessor):
735
+ img = resize_image(img, resize_shape)
736
+ return img
737
+
738
+
739
+ def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
740
+ """Resample audio using linear interpolation."""
741
+ if orig_sr == target_sr:
742
+ return audio
743
+
744
+ # Calculate the resampling ratio
745
+ ratio = target_sr / orig_sr
746
+
747
+ # Handle different audio shapes
748
+ if audio.ndim == 1:
749
+ # Mono audio - simple case
750
+ new_length = int(len(audio) * ratio)
751
+ old_indices = np.arange(len(audio))
752
+ new_indices = np.linspace(0, len(audio) - 1, new_length)
753
+ resampled = np.interp(new_indices, old_indices, audio)
754
+
755
+ elif audio.ndim == 2:
756
+ # Multi-channel audio - transpose to (samples, channels) if needed
757
+ if audio.shape[0] < audio.shape[1]:
758
+ audio = audio.T
759
+
760
+ # Resample each channel
761
+ n_samples, n_channels = audio.shape
762
+ new_length = int(n_samples * ratio)
763
+ old_indices = np.arange(n_samples)
764
+ new_indices = np.linspace(0, n_samples - 1, new_length)
765
+
766
+ resampled = np.zeros((new_length, n_channels))
767
+ for i in range(n_channels):
768
+ resampled[:, i] = np.interp(new_indices, old_indices, audio[:, i])
769
+ else:
770
+ raise ValueError(f"Audio array has unsupported shape: {audio.shape}")
771
+
772
+ return resampled
773
+
774
+
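A small worked example of the linear-interpolation resampler: a one-second 44.1 kHz signal becomes about 16,000 samples at 16 kHz (the exact length is `int(len * target_sr / orig_sr)`).

```python
import numpy as np

from mlx_vlm.utils import resample_audio

sr_in, sr_out = 44_100, 16_000
t = np.linspace(0, 1, sr_in, endpoint=False)
mono = np.sin(2 * np.pi * 440 * t)            # 1 s of a 440 Hz tone

resampled = resample_audio(mono, sr_in, sr_out)
print(len(mono), len(resampled))              # 44100 -> ~16000

# (samples, channels) input is resampled per channel.
stereo = np.stack([mono, mono], axis=1)
print(resample_audio(stereo, sr_in, sr_out).shape)  # (~16000, 2)
```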
775
+ def load_audio(
776
+ file: str,
777
+ sr: int,
778
+ timeout: int = 10,
779
+ ):
780
+ """
781
+ Helper function to load audio from either a URL or file.
782
+ """
783
+ if file.startswith(("http://", "https://")):
784
+ try:
785
+ response = requests.get(file, stream=True, timeout=timeout)
786
+ response.raise_for_status()
787
+ audio, sample_rate = sf.read(BytesIO(response.content), always_2d=True)
788
+ except Exception as e:
789
+ raise ValueError(
790
+ f"Failed to load audio from URL: {file} with error {e}"
791
+ ) from e
792
+ else:
793
+ audio, sample_rate = sf.read(file, always_2d=True)
794
+
795
+ if sample_rate != sr:
796
+ audio = resample_audio(audio, sample_rate, sr)
797
+ return np.array(audio).mean(axis=1)
798
+
799
+
800
+ def normalize_audio_features(features: mx.array) -> mx.array:
801
+ """Normalize mel spectrogram features for lossy audio formats (e.g., MP3)."""
802
+ return (features - mx.mean(features)) / (mx.std(features) + 1e-6)
803
+
804
+
805
+ def process_inputs(
806
+ processor,
807
+ prompts,
808
+ images=None,
809
+ audio=None,
810
+ add_special_tokens=False,
811
+ padding=True,
812
+ padding_side="left",
813
+ return_tensors="mlx",
814
+ **kwargs,
815
+ ):
816
+ # Get the process method from the processor
817
+ process_method = getattr(processor, "process", processor)
818
+ parameters = inspect.signature(process_method).parameters
819
+
820
+ # Prepare arguments
821
+ args = {
822
+ "text": prompts,
823
+ "images": images,
824
+ "padding": padding,
825
+ "return_tensors": return_tensors,
826
+ }
827
+ if "padding_side" in parameters:
828
+ args["padding_side"] = padding_side
829
+
830
+ # Add special tokens if supported
831
+ if "add_special_tokens" in parameters:
832
+ args["add_special_tokens"] = add_special_tokens
833
+
834
+ for param in parameters.keys():
835
+ if param in kwargs.keys():
836
+ args[param] = kwargs.get(param, None)
837
+
838
+ # Add audio if provided and supported
839
+ if audio is not None and len(audio) > 0:
840
+ if "audio" in parameters:
841
+ args["audio"] = audio
842
+ else:
843
+ raise ValueError(f"Processor {processor} does not support audio parameter")
844
+
845
+ return process_method(**args)
846
+
847
+
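The key trick in `process_inputs` is introspecting the processor's signature so that optional arguments (`padding_side`, `add_special_tokens`, `audio`, extra kwargs) are only forwarded when the processor actually accepts them. A stripped-down illustration of that pattern with a stand-in callable, not a real Hugging Face processor:

```python
import inspect

def fake_processor(text, images=None, padding=True, return_tensors="mlx"):
    """Stand-in for an AutoProcessor __call__; just echoes what it received."""
    return {"text": text, "images": images, "padding": padding}

parameters = inspect.signature(fake_processor).parameters
args = {"text": "Describe this image.", "images": ["img"], "padding": True}

# Only pass optional knobs the callable actually declares.
if "padding_side" in parameters:
    args["padding_side"] = "left"       # skipped: fake_processor has no padding_side
if "add_special_tokens" in parameters:
    args["add_special_tokens"] = False  # skipped as well

print(fake_processor(**args))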
848
+ def process_inputs_with_fallback(
849
+ processor,
850
+ prompts,
851
+ images,
852
+ audio,
853
+ add_special_tokens=False,
854
+ return_tensors="mlx",
855
+ **kwargs,
856
+ ):
857
+ # First attempt with specified return_tensors
858
+ try:
859
+ return process_inputs(
860
+ processor,
861
+ prompts=prompts,
862
+ images=images,
863
+ audio=audio,
864
+ add_special_tokens=add_special_tokens,
865
+ return_tensors=return_tensors,
866
+ **kwargs,
867
+ )
868
+ except Exception as e:
869
+ # Fallback to PyTorch tensors if MLX fails
870
+ if return_tensors != "pt":
871
+ try:
872
+ return process_inputs(
873
+ processor,
874
+ prompts=prompts,
875
+ images=images,
876
+ audio=audio,
877
+ add_special_tokens=add_special_tokens,
878
+ return_tensors="pt",
879
+ **kwargs,
880
+ )
881
+ except Exception as fallback_error:
882
+ raise ValueError(
883
+ f"Failed to process inputs with error: {fallback_error}"
884
+ ) from fallback_error
885
+
886
+ raise ValueError(f"Failed to process inputs with error: {e}")
887
+
888
+
889
+ def prepare_inputs(
890
+ processor,
891
+ images=None,
892
+ audio=None,
893
+ prompts=None,
894
+ image_token_index=None,
895
+ resize_shape=None,
896
+ add_special_tokens=False,
897
+ padding=True,
898
+ padding_side="left",
899
+ pad_to_uniform_size=False,
900
+ **kwargs,
901
+ ):
902
+
903
+ if not images and not audio:
904
+ tokenizer = (
905
+ processor.tokenizer if hasattr(processor, "tokenizer") else processor
906
+ )
907
+ # Ensure pad_token exists when padding text-only inputs
908
+ if padding and tokenizer.pad_token is None:
909
+ tokenizer.pad_token = tokenizer.eos_token
910
+ inputs = tokenizer(
911
+ prompts,
912
+ add_special_tokens=add_special_tokens,
913
+ padding=padding,
914
+ padding_side=padding_side,
915
+ )
916
+ input_ids = mx.array([inputs.input_ids])
917
+ mask = mx.array([inputs.attention_mask])
918
+ return {
919
+ "input_ids": input_ids,
920
+ "attention_mask": mask,
921
+ }
922
+
923
+ # Process images
924
+ if images is not None:
925
+ if not isinstance(images, list):
926
+ images = [images]
927
+
928
+ image_processor = (
929
+ processor.image_processor if hasattr(processor, "image_processor") else None
930
+ )
931
+ images = [process_image(img, resize_shape, image_processor) for img in images]
932
+
933
+ # For batching, we need uniform image sizes. Instead of padding to the
934
+ # largest image (which adds white borders that hurt accuracy), we resize
935
+ # all images to the model's expected input size.
936
+ if len(images) > 1 and pad_to_uniform_size:
937
+ # Get target size from image processor if available
938
+ target_size = None
939
+ if image_processor is not None and hasattr(image_processor, "size"):
940
+ size = image_processor.size
941
+ if isinstance(size, tuple):
942
+ target_size = size
943
+ elif isinstance(size, dict):
944
+ target_size = (size.get("height", 384), size.get("width", 384))
945
+ elif isinstance(size, int):
946
+ target_size = (size, size)
947
+
948
+ if target_size is not None:
949
+ # Resize all images to the target size
950
+ resized_images = []
951
+ for img in images:
952
+ if img.size != (
953
+ target_size[1],
954
+ target_size[0],
955
+ ): # PIL uses (width, height)
956
+ img = img.resize(
957
+ (target_size[1], target_size[0]), Image.Resampling.BICUBIC
958
+ )
959
+ resized_images.append(img)
960
+ images = resized_images
961
+ else:
962
+ # Fallback: pad to largest size (original behavior)
963
+ max_width = max(img.width for img in images)
964
+ max_height = max(img.height for img in images)
965
+
966
+ padded_images = []
967
+ for img in images:
968
+ if img.width != max_width or img.height != max_height:
969
+ padded_img = Image.new(
970
+ "RGB", (max_width, max_height), (255, 255, 255)
971
+ )
972
+ x_offset = (max_width - img.width) // 2
973
+ y_offset = (max_height - img.height) // 2
974
+ padded_img.paste(img, (x_offset, y_offset))
975
+ padded_images.append(padded_img)
976
+ else:
977
+ padded_images.append(img)
978
+ images = padded_images
979
+
980
+ # Process audio
981
+ audio_inputs = None
982
+ audio_feature_lengths = None
983
+ is_qwen3_omni_moe = False
984
+ processor_class_name = (
985
+ processor.__class__.__name__ if hasattr(processor, "__class__") else ""
986
+ )
987
+ if (
988
+ "qwen3" in processor_class_name.lower()
989
+ and "omni" in processor_class_name.lower()
990
+ ):
991
+ is_qwen3_omni_moe = True
992
+
993
+ is_lossy_audio = False
994
+ if audio is not None and len(audio) > 0:
995
+ if not isinstance(audio, list):
996
+ audio = [audio]
997
+
998
+ # Check if any audio file is a lossy format (MP3, AAC, OGG, etc.)
999
+ lossy_extensions = {".mp3", ".m4a"}
1000
+ is_lossy_audio = any(
1001
+ str(f).lower().endswith(tuple(lossy_extensions)) for f in audio
1002
+ )
1003
+
1004
+ if len(audio) > 1:
1005
+ print(
1006
+ "\033[33mWarning\033[0m: Single prompt with multiple audio files is not supported yet. Using the first audio file.\n"
1007
+ )
1008
+ audio = audio[:1]
1009
+
1010
+ if is_qwen3_omni_moe:
1011
+ audio_arrays = [
1012
+ load_audio(audio_file, sr=processor.feature_extractor.sampling_rate)
1013
+ for audio_file in audio
1014
+ ]
1015
+ audio_arrays = [
1016
+ audio_array.astype(np.float32) for audio_array in audio_arrays
1017
+ ]
1018
+
1019
+ feature_extractor = getattr(processor, "feature_extractor", None)
1020
+ if feature_extractor is None:
1021
+ raise ValueError("Processor missing feature_extractor for audio prep.")
1022
+
1023
+ audio_inputs = feature_extractor(
1024
+ audio_arrays,
1025
+ sampling_rate=feature_extractor.sampling_rate,
1026
+ padding=True,
1027
+ return_attention_mask=True,
1028
+ )
1029
+
1030
+ audio_feature_lengths = np.sum(
1031
+ audio_inputs["attention_mask"], axis=-1, dtype=np.int32
1032
+ )
1033
+ else:
1034
+ feature_extractor = getattr(processor, "feature_extractor", None)
1035
+ if feature_extractor is not None:
1036
+ audio = [
1037
+ load_audio(audio_file, sr=feature_extractor.sampling_rate)
1038
+ for audio_file in audio
1039
+ ]
1040
+ else:
1041
+ audio = [
1042
+ load_audio(audio_file, sr=processor.feature_extractor.sampling_rate)
1043
+ for audio_file in audio
1044
+ ]
1045
+
1046
+ model_inputs = {}
1047
+
1048
+ if hasattr(processor, "image_processor") and isinstance(
1049
+ processor.image_processor, BaseImageProcessor
1050
+ ):
1051
+ if not isinstance(prompts, list):
1052
+ prompts = [prompts]
1053
+
1054
+ if processor.pad_token is None:
1055
+ processor.pad_token = processor.eos_token
1056
+
1057
+ # Moondream expects image patch tokens immediately after BOS. Its
1058
+ # prompting is string-based, so we ignore literal "<image>" placement
1059
+ # and always insert the image token block after the first token.
1060
+ if processor.__class__.__name__ == "MoondreamProcessor":
1061
+ # Clean up prompts: strip <image> markers and add generation suffix
1062
+ cleaned_prompts = []
1063
+ for prompt in prompts:
1064
+ clean = prompt.replace("<image>", "").strip()
1065
+ # Add the generation prompt suffix moondream expects
1066
+ if not clean.endswith("Answer:") and not clean.endswith("Answer: "):
1067
+ clean = clean + "\n\nAnswer:"
1068
+ cleaned_prompts.append(clean)
1069
+
1070
+ token_ids_per_prompt = [
1071
+ processor(prompt, add_special_tokens=True).input_ids for prompt in cleaned_prompts
1072
+ ]
1073
+ text_chunks = []
1074
+ for ids in token_ids_per_prompt:
1075
+ if not ids:
1076
+ ids = [processor.bos_token_id]
1077
+ if ids[0] != processor.bos_token_id:
1078
+ ids = [processor.bos_token_id] + ids
1079
+ # Represent as [bos], [rest]
1080
+ text_chunks.append([[ids[0]], ids[1:]])
1081
+ else:
1082
+ text_chunks = [
1083
+ [processor(chunk).input_ids for chunk in prompt.split("<image>")]
1084
+ for prompt in prompts
1085
+ ]
1086
+
1087
+ # Normalize chunks to a 2-part [before, after] representation.
1088
+ # - If prompt has no "<image>", we treat it as [full_prompt, ""]
1089
+ # - If prompt has multiple "<image>", we only insert one image token and
1090
+ # concatenate the remaining text parts into the "after" section.
1091
+ normalized_chunks = []
1092
+ for chunks in text_chunks:
1093
+ if len(chunks) == 1:
1094
+ before = chunks[0]
1095
+ after = []
1096
+ elif len(chunks) >= 2:
1097
+ before = chunks[0]
1098
+ after = []
1099
+ for part in chunks[1:]:
1100
+ after += part
1101
+ else:
1102
+ before = []
1103
+ after = []
1104
+ normalized_chunks.append([before, after])
1105
+ text_chunks = normalized_chunks
1106
+
1107
+ # Find the maximum length for padding.
1108
+ # Note: for MoondreamProcessor we expand a single "<image>" marker into
1109
+ # 729 patch tokens.
1110
+ if processor.__class__.__name__ == "MoondreamProcessor":
1111
+ max_length = max(
1112
+ sum(len(chunk) for chunk in chunks) + 729 for chunks in text_chunks
1113
+ )
1114
+ else:
1115
+ max_length = max(
1116
+ sum(len(chunk) for chunk in chunks) + 1 for chunks in text_chunks
1117
+ )
1118
+
1119
+ # Pad and create input_ids
1120
+ input_ids = []
1121
+ for chunks in text_chunks:
1122
+ # Moondream2 uses a block of patch tokens (729) rather than a single
1123
+ # placeholder token. Keep this model-specific to avoid impacting
1124
+ # other multimodal models.
1125
+ if processor.__class__.__name__ == "MoondreamProcessor":
1126
+ image_tokens = [image_token_index] * 729
1127
+ else:
1128
+ image_tokens = [image_token_index]
1129
+
1130
+ ids = chunks[0] + image_tokens + chunks[1]
1131
+ padding = [processor.pad_token_id] * (max_length - len(ids))
1132
+ input_ids.append(mx.array(ids + padding))
1133
+
1134
+ model_inputs["input_ids"] = mx.array(input_ids)
1135
+
1136
+ # Handle Moondream's multi-crop preprocessing which returns
1137
+ # (crops_list, crop_counts, tilings) instead of just pixel_values
1138
+ if processor.__class__.__name__ == "MoondreamProcessor":
1139
+ crops_list, crop_counts, tilings = processor.image_processor.preprocess(
1140
+ images=images
1141
+ )
1142
+ # Concatenate all crops for batch processing
1143
+ all_crops = np.concatenate(crops_list, axis=0)
1144
+ model_inputs["pixel_values"] = mx.array(all_crops)
1145
+ model_inputs["crop_counts"] = crop_counts
1146
+ model_inputs["tilings"] = tilings
1147
+ else:
1148
+ pixel_values = processor.image_processor.preprocess(images=images)
1149
+ model_inputs["pixel_values"] = mx.array(np.stack(pixel_values))
1150
+
1151
+ model_inputs["attention_mask"] = mx.array(
1152
+ [(ids != processor.pad_token_id) for ids in input_ids]
1153
+ ).astype(mx.int32)
1154
+
1155
+ else:
1156
+ if hasattr(processor, "tokenizer") and processor.tokenizer.pad_token is None:
1157
+ processor.tokenizer.pad_token = processor.tokenizer.eos_token
1158
+
1159
+ inputs = process_inputs_with_fallback(
1160
+ processor,
1161
+ images=images,
1162
+ audio=audio,
1163
+ prompts=prompts,
1164
+ add_special_tokens=add_special_tokens,
1165
+ **kwargs,
1166
+ )
1167
+
1168
+ if "images" in inputs:
1169
+ inputs["pixel_values"] = inputs["images"]
1170
+ inputs.pop("images")
1171
+
1172
+ model_inputs["attention_mask"] = (
1173
+ mx.array(inputs["attention_mask"]) if "attention_mask" in inputs else None
1174
+ )
1175
+
1176
+ # Convert inputs to model_inputs with mx.array if present
1177
+ for key, value in inputs.items():
1178
+ if key not in model_inputs:
1179
+ if isinstance(value, (str, list, mx.array)):
1180
+ model_inputs[key] = value
1181
+ else:
1182
+ model_inputs[key] = mx.array(value)
1183
+
1184
+ if audio_inputs is not None:
1185
+ model_inputs["input_features"] = mx.array(audio_inputs["input_features"])
1186
+ model_inputs["feature_attention_mask"] = mx.array(
1187
+ audio_inputs["attention_mask"]
1188
+ ).astype(mx.int32)
1189
+ model_inputs["audio_feature_lengths"] = mx.array(
1190
+ audio_feature_lengths, dtype=mx.int32
1191
+ )
1192
+
1193
+ if is_lossy_audio and "input_features" in model_inputs:
1194
+ f = model_inputs["input_features"]
1195
+ if isinstance(f, list):
1196
+ model_inputs["input_features"] = [
1197
+ normalize_audio_features(mx.array(x)) for x in f
1198
+ ]
1199
+ else:
1200
+ model_inputs["input_features"] = normalize_audio_features(f)
1201
+
1202
+ return model_inputs
1203
+
1204
+
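Batching requires uniform image sizes; `prepare_inputs` either resizes everything to the processor's expected size or, as a fallback, centers each image on a white canvas matching the largest one. The padding fallback in isolation, using plain PIL and no processor:

```python
from PIL import Image

images = [Image.new("RGB", (400, 300), "blue"), Image.new("RGB", (800, 600), "green")]

max_width = max(img.width for img in images)
max_height = max(img.height for img in images)

padded = []
for img in images:
    if (img.width, img.height) != (max_width, max_height):
        canvas = Image.new("RGB", (max_width, max_height), (255, 255, 255))
        canvas.paste(img, ((max_width - img.width) // 2, (max_height - img.height) // 2))
        img = canvas
    padded.append(img)

print([im.size for im in padded])  # [(800, 600), (800, 600)]
```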
1205
+ def group_images_by_shape(
1206
+ images: List[Image.Image],
1207
+ disable_grouping: bool = False,
1208
+ ) -> Tuple[Dict[Tuple[int, int], List[Image.Image]], Dict[Tuple[int, int], List[int]]]:
1209
+ """
1210
+ Group images by their dimensions for efficient batch processing.
1211
+
1212
+ Images with the same dimensions can be stacked and processed together,
1213
+ which is much faster than processing individually (especially on GPU).
1214
+
1215
+ Args:
1216
+ images: List of PIL images to group
1217
+ disable_grouping: If True, each image gets its own group (useful for debugging)
1218
+
1219
+ Returns:
1220
+ grouped_images: Dict mapping shape -> list of images with that shape
1221
+ grouped_indices: Dict mapping shape -> list of original indices
1222
+
1223
+ Example:
1224
+ >>> images = [img_400x300, img_800x600, img_400x300_2]
1225
+ >>> grouped, indices = group_images_by_shape(images)
1226
+ >>> grouped
1227
+ {(300, 400): [img_400x300, img_400x300_2], (600, 800): [img_800x600]}
1228
+ >>> indices
1229
+ {(300, 400): [0, 2], (600, 800): [1]}
1230
+ """
1231
+ if disable_grouping:
1232
+ # Each image in its own group
1233
+ grouped_images = {}
1234
+ grouped_indices = {}
1235
+ for i, img in enumerate(images):
1236
+ shape = (img.height, img.width)
1237
+ # Make each shape unique by adding index
1238
+ unique_shape = (img.height, img.width, i)
1239
+ grouped_images[unique_shape] = [img]
1240
+ grouped_indices[unique_shape] = [i]
1241
+ return grouped_images, grouped_indices
1242
+
1243
+ grouped_images: Dict[Tuple[int, int], List[Image.Image]] = {}
1244
+ grouped_indices: Dict[Tuple[int, int], List[int]] = {}
1245
+
1246
+ for i, img in enumerate(images):
1247
+ shape = (img.height, img.width)
1248
+ if shape not in grouped_images:
1249
+ grouped_images[shape] = []
1250
+ grouped_indices[shape] = []
1251
+ grouped_images[shape].append(img)
1252
+ grouped_indices[shape].append(i)
1253
+
1254
+ return grouped_images, grouped_indices
1255
+
1256
+
1257
+ class StoppingCriteria:
1258
+ def __init__(self, eos_token_ids: List[int], tokenizer=None):
1259
+
1260
+ if isinstance(eos_token_ids, int):
1261
+ self.eos_token_ids = [eos_token_ids]
1262
+ else:
1263
+ self.eos_token_ids = eos_token_ids
1264
+
1265
+ self.tokenizer = tokenizer
1266
+
1267
+ def add_eos_token_ids(self, new_eos_token_ids: Union[int, List[int]] = None):
1268
+ """
1269
+ Add new token IDs to the list of EOS token IDs.
1270
+
1271
+ Args:
1272
+ new_eos_token_ids: Integer, string, or list of integers/strings representing token IDs to add.
1273
+ If strings are provided, they will be converted to integers if possible.
1274
+ """
1275
+ if new_eos_token_ids is None:
1276
+ return
1277
+
1278
+ if self.tokenizer is None:
1279
+ raise ValueError("Processor is not provided")
1280
+
1281
+ if new_eos_token_ids is not None:
1282
+ if isinstance(new_eos_token_ids, str):
1283
+ new_eos_token_ids = [new_eos_token_ids]
1284
+ new_eos_token_ids = [
1285
+ self.tokenizer.encode(" " + token, add_special_tokens=False)[-1]
1286
+ for token in new_eos_token_ids
1287
+ ]
1288
+ self.eos_token_ids.extend(new_eos_token_ids)
1289
+
1290
+ def reset(self, eos_token_ids: List[int] = None):
1291
+ eos_token_ids = (
1292
+ eos_token_ids if eos_token_ids is not None else self.tokenizer.eos_token_ids
1293
+ )
1294
+
1295
+ if isinstance(eos_token_ids, int):
1296
+ eos_token_ids = [eos_token_ids]
1297
+
1298
+ if self.eos_token_ids != eos_token_ids:
1299
+ self.eos_token_ids = eos_token_ids
1300
+
1301
+ def __call__(self, input_ids: mx.array) -> bool:
1302
+ return input_ids in self.eos_token_ids
1303
+
1304
+
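A short sketch of the stopping logic used during generation: the criteria holds the EOS token ids and is called with each newly sampled token id (the ids below are arbitrary).

```python
from mlx_vlm.utils import StoppingCriteria

criteria = StoppingCriteria(eos_token_ids=2)  # a single id is promoted to a list
print(criteria(2))    # True  -> stop generating
print(criteria(17))   # False -> keep going

# When constructed with a tokenizer, extra textual stop tokens can be added,
# e.g. criteria.add_eos_token_ids(["<|im_end|>"]), which encodes them to ids first.
```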
1305
+ def print_array_report(t: mx.array, label: Optional[str]) -> dict:
1306
+ """
1307
+ Return a dictionary report of an MLX array similar to PyTorch's tensor representation.
1308
+ Args:
1309
+ t: MLX array to analyze
+ label: Optional label included in the printed report
1310
+ Returns:
1311
+ Dictionary containing shape, dtype, value representation, and statistics
1312
+ """
1313
+
1314
+ # Get basic statistics
1315
+ mean_val = mx.mean(t)
1316
+ std_val = mx.std(t)
1317
+ min_val = mx.min(t)
1318
+ max_val = mx.max(t)
1319
+
1320
+ report = {
1321
+ "shape": f"{tuple(t.shape)}",
1322
+ "dtype": str(t.dtype),
1323
+ "value": repr(t),
1324
+ "mean": f"array({mean_val}, dtype={t.dtype})",
1325
+ "std": f"array({std_val}, dtype={t.dtype})",
1326
+ "min": f"array({min_val}, dtype={t.dtype})",
1327
+ "max": f"array({max_val}, dtype={t.dtype})",
1328
+ "label": label if label else "array",
1329
+ }
1330
+
1331
+ # Print each field, handling 'value' specially
1332
+ print("{")
1333
+ for key, value in report.items():
1334
+ if key == "value":
1335
+ print(f" '{key}': {value},") # No quotes around value
1336
+ else:
1337
+ print(f" '{key}': {repr(value)},")
1338
+ print("}")
1339
+ return report
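And a quick usage sketch for the array report helper:

```python
import mlx.core as mx

from mlx_vlm.utils import print_array_report

x = mx.arange(12, dtype=mx.float32).reshape(3, 4)
report = print_array_report(x, label="demo")
print(report["shape"], report["dtype"])  # (3, 4) mlx.core.float32
```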