fount-vlm-nell-02 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/generate.py ADDED
@@ -0,0 +1,1457 @@
1
+ import argparse
2
+ import codecs
3
+ import contextlib
4
+ import functools
5
+ import json
6
+ import time
7
+ from dataclasses import dataclass
8
+ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
9
+
10
+ import mlx.core as mx
11
+ import mlx.nn as nn
12
+ from mlx.utils import tree_reduce
13
+ from mlx_lm.generate import maybe_quantize_kv_cache
14
+ from tqdm import tqdm
15
+ from transformers import PreTrainedTokenizer
16
+
17
+ from .models import cache
18
+ from .prompt_utils import apply_chat_template
19
+ from .sample_utils import top_p_sampling
20
+ from .utils import (
21
+ StoppingCriteria,
22
+ apply_repetition_penalty,
23
+ group_images_by_shape,
24
+ load,
25
+ prepare_inputs,
26
+ )
27
+
28
+ DEFAULT_MODEL_PATH = "mlx-community/nanoLLaVA-1.5-8bit"
29
+ DEFAULT_IMAGE = None
30
+ DEFAULT_AUDIO = None
31
+ DEFAULT_PROMPT = "What are these?"
32
+ DEFAULT_MAX_TOKENS = 256
33
+ DEFAULT_TEMPERATURE = 0.5
34
+ DEFAULT_TOP_P = 1.0
35
+ DEFAULT_SEED = 0
36
+ DEFAULT_QUANTIZED_KV_START = 5000
37
+
38
+
39
+ def parse_arguments():
40
+ parser = argparse.ArgumentParser(
41
+ description="Generate text from an image using a model."
42
+ )
43
+ parser.add_argument(
44
+ "--model",
45
+ type=str,
46
+ default=DEFAULT_MODEL_PATH,
47
+ help="The path to the local model directory or Hugging Face repo.",
48
+ )
49
+ parser.add_argument(
50
+ "--adapter-path",
51
+ type=str,
52
+ default=None,
53
+ help="The path to the adapter weights.",
54
+ )
55
+ parser.add_argument(
56
+ "--image",
57
+ type=str,
58
+ nargs="+",
59
+ default=DEFAULT_IMAGE,
60
+ help="URL or path of the image to process.",
61
+ )
62
+ parser.add_argument(
63
+ "--audio",
64
+ type=str,
65
+ nargs="+",
66
+ default=DEFAULT_AUDIO,
67
+ help="URL or path of the audio to process.",
68
+ )
69
+ parser.add_argument(
70
+ "--resize-shape",
71
+ type=int,
72
+ nargs="+",
73
+ default=None,
74
+ help="Resize shape for the image.",
75
+ )
76
+ parser.add_argument(
77
+ "--prompt",
78
+ type=str,
79
+ nargs="+",
80
+ default=DEFAULT_PROMPT,
81
+ help="Message to be processed by the model.",
82
+ )
83
+ parser.add_argument(
84
+ "--system",
85
+ type=str,
86
+ default=None,
87
+ help="System message for the model.",
88
+ )
89
+ parser.add_argument(
90
+ "--max-tokens",
91
+ type=int,
92
+ default=DEFAULT_MAX_TOKENS,
93
+ help="Maximum number of tokens to generate.",
94
+ )
95
+ parser.add_argument(
96
+ "--temperature",
97
+ type=float,
98
+ default=DEFAULT_TEMPERATURE,
99
+ help="Temperature for sampling.",
100
+ )
101
+ parser.add_argument("--chat", action="store_true", help="Chat in multi-turn style.")
102
+ parser.add_argument("--verbose", action="store_false", help="Detailed output.")
103
+ parser.add_argument(
104
+ "--eos-tokens",
105
+ type=str,
106
+ nargs="+",
107
+ default=None,
108
+ help="EOS tokens to add to the tokenizer.",
109
+ )
110
+ parser.add_argument(
111
+ "--max-kv-size",
112
+ type=int,
113
+ default=None,
114
+ help="Maximum KV size for the prompt cache.",
115
+ )
116
+ parser.add_argument(
117
+ "--kv-bits",
118
+ type=int,
119
+ default=None,
120
+ help="Number of bits to quantize the KV cache to.",
121
+ )
122
+ parser.add_argument(
123
+ "--kv-group-size",
124
+ type=int,
125
+ default=64,
126
+ help="Group size for the KV cache.",
127
+ )
128
+ parser.add_argument(
129
+ "--quantized-kv-start",
130
+ type=int,
131
+ default=DEFAULT_QUANTIZED_KV_START,
132
+ help="Start index for the quantized KV cache.",
133
+ )
134
+ parser.add_argument(
135
+ "--skip-special-tokens",
136
+ action="store_true",
137
+ help="Skip special tokens in the detokenizer.",
138
+ )
139
+ parser.add_argument(
140
+ "--force-download",
141
+ action="store_true",
142
+ help="Force download the model from Hugging Face.",
143
+ )
144
+ parser.add_argument(
145
+ "--revision",
146
+ type=str,
147
+ default="main",
148
+ help="The specific model version to use (branch, tag, commit).",
149
+ )
150
+ parser.add_argument(
151
+ "--trust-remote-code",
152
+ action="store_true",
153
+ help="Trust remote code when loading the model.",
154
+ )
155
+ parser.add_argument(
156
+ "--processor-kwargs",
157
+ type=json.loads,
158
+ default={},
159
+ help="Extra processor kwargs as JSON. "
160
+ 'Example: --processor-kwargs \'{"cropping": false, "max_patches": 3}\'',
161
+ )
162
+ parser.add_argument(
163
+ "--prefill-step-size",
164
+ type=int,
165
+ default=None,
166
+ help="Number of tokens to process per prefill step. "
167
+ "Lower values reduce peak memory usage but may be slower. "
168
+ "Try 512 or 256 if you hit GPU memory errors during prefill.",
169
+ )
170
+
171
+ return parser.parse_args()
172
+
173
+
174
+ # A stream on the default device just for generation
175
+ generation_stream = mx.new_stream(mx.default_device())
176
+
177
+
178
+ @contextlib.contextmanager
179
+ def wired_limit(model: nn.Module, streams: Optional[List[mx.Stream]] = None):
180
+ """
181
+ A context manager to temporarily change the wired limit.
182
+
183
+ Note, the wired limit should not be changed during an async eval. If an
184
+ async eval could be running pass in the streams to synchronize with prior
185
+ to exiting the context manager.
186
+ """
187
+ if not mx.metal.is_available():
188
+ yield
189
+ return
190
+
191
+ model_bytes = tree_reduce(
192
+ lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
193
+ )
194
+ max_rec_size = mx.device_info()["max_recommended_working_set_size"]
195
+ if model_bytes > 0.9 * max_rec_size:
196
+ model_mb = model_bytes // 2**20
197
+ max_rec_mb = max_rec_size // 2**20
198
+ print(
199
+ f"[WARNING] Generating with a model that requires {model_mb} MB "
200
+ f"which is close to the maximum recommended size of {max_rec_mb} "
201
+ "MB. This can be slow. See the documentation for possible work-arounds: "
202
+ "https://github.com/ml-explore/mlx-lm/tree/main#large-models"
203
+ )
204
+ old_limit = mx.set_wired_limit(max_rec_size)
205
+ try:
206
+ yield
207
+ finally:
208
+ if streams is not None:
209
+ for s in streams:
210
+ mx.synchronize(s)
211
+ else:
212
+ mx.synchronize()
213
+ mx.set_wired_limit(old_limit)
214
+
215
+
216
+ @dataclass
217
+ class GenerationResult:
218
+ text: str = ""
219
+ token: Optional[int] = None
220
+ logprobs: Optional[List[float]] = None
221
+ prompt_tokens: int = 0
222
+ generation_tokens: int = 0
223
+ total_tokens: int = 0
224
+ prompt_tps: float = 0.0
225
+ generation_tps: float = 0.0
226
+ peak_memory: float = 0.0
227
+
228
+
229
+ def generate_step(
230
+ input_ids: mx.array,
231
+ model: nn.Module,
232
+ pixel_values,
233
+ mask,
234
+ *,
235
+ max_tokens: int = 256,
236
+ temperature: float = 0.0,
237
+ repetition_penalty: Optional[float] = None,
238
+ repetition_context_size: Optional[int] = 20,
239
+ top_p: float = 1.0,
240
+ logit_bias: Optional[Dict[int, float]] = None,
241
+ prompt_cache: Optional[List[Any]] = None,
242
+ max_kv_size: Optional[int] = None,
243
+ kv_bits: Optional[int] = None,
244
+ kv_group_size: int = 64,
245
+ quantized_kv_start: int = 0,
246
+ logits_processors: Optional[List[Callable[[mx.array, mx.array], mx.array]]] = None,
247
+ prefill_step_size: Optional[int] = 2048,
248
+ **kwargs,
249
+ ) -> Generator[Tuple[mx.array, mx.array], None, None]:
250
+ """
251
+ A generator producing token ids based on the given prompt from the model.
252
+
253
+ Args:
254
+ input_ids (mx.array): The input prompt token ids.
255
+ model (nn.Module): The model to use for generation.
256
+ pixel_values: The pixel values for vision models (optional).
257
+ mask: The attention mask (optional).
258
+ max_tokens (int): Maximum number of tokens to generate. Default: ``256``.
259
+ temperature (float): The temperature for sampling, if 0 the argmax is used.
260
+ Default: ``0``.
261
+ repetition_penalty (float, optional): The penalty factor for repeating
262
+ tokens.
263
+ repetition_context_size (int, optional): The number of tokens to
264
+ consider for repetition penalty. Default: ``20``.
265
+ top_p (float, optional): Nucleus sampling, higher means model considers
266
+ more less likely words.
267
+ logit_bias (dictionary, optional): Additive logit bias.
268
+ prompt_cache (list, optional): Pre-existing KV cache for the prompt.
269
+ max_kv_size (int, optional): Maximum KV cache size.
270
+ kv_bits (int, optional): Number of bits for KV cache quantization.
271
+ kv_group_size (int): Group size for KV cache quantization. Default: ``64``.
272
+ quantized_kv_start (int): Start index for quantized KV cache. Default: ``0``.
273
+ logits_processors (list, optional): List of logits processor functions.
274
+ prefill_step_size (int): Number of tokens to process per prefill step.
275
+ Chunked prefill processes prompts in smaller chunks to reduce peak
276
+ memory usage. Default: ``2048``.
277
+
278
+ Yields:
279
+ Generator[Tuple[mx.array, mx.array], None, None]: A generator producing
280
+ one token and a vector of log probabilities.
281
+ """
282
+
283
+ quantize_cache_fn = functools.partial(
284
+ maybe_quantize_kv_cache,
285
+ quantized_kv_start=quantized_kv_start,
286
+ kv_group_size=kv_group_size,
287
+ kv_bits=kv_bits,
288
+ )
289
+
290
+ def sample(logits: mx.array) -> Tuple[mx.array, float]:
291
+ if logit_bias:
292
+ indices = mx.array(list(logit_bias.keys()))
293
+ values = mx.array(list(logit_bias.values()))
294
+ logits[:, indices] += values
295
+ logprobs = logits - mx.logsumexp(logits)
296
+
297
+ if temperature == 0:
298
+ token = mx.argmax(logits, axis=-1)
299
+ else:
300
+ if top_p > 0 and top_p < 1.0:
301
+ token = top_p_sampling(logits, top_p, temperature)
302
+ else:
303
+ token = mx.random.categorical(logits * (1 / temperature))
304
+
305
+ return token, logprobs
306
+
307
+ if repetition_penalty and (
308
+ repetition_penalty < 0 or not isinstance(repetition_penalty, float)
309
+ ):
310
+ raise ValueError(
311
+ f"repetition_penalty must be a non-negative float, got {repetition_penalty}"
312
+ )
313
+
314
+ y = input_ids
315
+ tokens = None # Track tokens for logits processors
316
+
317
+ # Create the KV cache for generation
318
+ if prompt_cache is None:
319
+ prompt_cache = cache.make_prompt_cache(
320
+ model.language_model,
321
+ max_kv_size=max_kv_size,
322
+ )
323
+
324
+ repetition_context = input_ids.reshape(-1).tolist()
325
+
326
+ if repetition_context_size:
327
+ repetition_context = repetition_context[-repetition_context_size:]
328
+
329
+ def _step(y, inputs_embeds=None):
330
+ nonlocal tokens, repetition_context, kwargs
331
+ with mx.stream(generation_stream):
332
+ if "decoder_input_ids" in kwargs:
333
+ outputs = model.language_model(
334
+ cache=prompt_cache,
335
+ **kwargs,
336
+ )
337
+ else:
338
+ outputs = model.language_model(
339
+ y,
340
+ inputs_embeds=inputs_embeds,
341
+ cache=prompt_cache,
342
+ **kwargs,
343
+ )
344
+
345
+ logits = outputs.logits[:, -1, :]
346
+
347
+ # Apply logits processors before repetition penalty
348
+ if logits_processors:
349
+ # Efficiently update tokens by concatenating only the new token
350
+ tokens = mx.concat([tokens, y])
351
+ for processor in logits_processors:
352
+ logits = processor(tokens, logits)
353
+
354
+ if repetition_penalty:
355
+ logits = apply_repetition_penalty(
356
+ logits, repetition_context, repetition_penalty
357
+ )
358
+ y, logprobs = sample(logits)
359
+ repetition_context.append(y.item())
360
+ else:
361
+ y, logprobs = sample(logits)
362
+
363
+ if repetition_context_size:
364
+ if len(repetition_context) > repetition_context_size:
365
+ repetition_context = repetition_context[-repetition_context_size:]
366
+
367
+ quantize_cache_fn(prompt_cache)
368
+
369
+ if outputs.cross_attention_states is not None:
370
+ kwargs = {"cross_attention_states": outputs.cross_attention_states}
371
+ elif outputs.encoder_outputs is not None:
372
+ kwargs = {
373
+ "decoder_input_ids": y[None],
374
+ "encoder_outputs": outputs.encoder_outputs,
375
+ }
376
+ else:
377
+ kwargs = {}
378
+
379
+ return y, logprobs.squeeze(0)
380
+
381
+ with mx.stream(generation_stream):
382
+
383
+ # Get input embeddings (handles both multimodal and text-only)
384
+ embedding_output = model.get_input_embeddings(
385
+ input_ids, pixel_values, mask=mask, **kwargs
386
+ )
387
+
388
+ inputs_embeds = embedding_output.inputs_embeds
389
+
390
+ kwargs.update(
391
+ {
392
+ k: v
393
+ for k, v in embedding_output.to_dict().items()
394
+ if k != "inputs_embeds" and v is not None
395
+ }
396
+ )
397
+ if prefill_step_size is not None and inputs_embeds.shape[1] > prefill_step_size:
398
+ # Chunked prefill with embeddings
399
+ total_tokens = inputs_embeds.shape[1]
400
+ with tqdm(total=total_tokens, desc="Prefill", unit="tok") as pbar:
401
+ while inputs_embeds.shape[1] > 1:
402
+ n_to_process = min(prefill_step_size, inputs_embeds.shape[1] - 1)
403
+ model.language_model(
404
+ inputs=input_ids[:, :n_to_process],
405
+ inputs_embeds=inputs_embeds[:, :n_to_process],
406
+ cache=prompt_cache,
407
+ **kwargs,
408
+ )
409
+ quantize_cache_fn(prompt_cache)
410
+ mx.eval([c.state for c in prompt_cache])
411
+ inputs_embeds = inputs_embeds[:, n_to_process:]
412
+ input_ids = input_ids[:, n_to_process:]
413
+ mx.clear_cache()
414
+ pbar.update(n_to_process)
415
+
416
+ input_ids = input_ids[:, -1:]
417
+
418
+ y, logprobs = _step(input_ids, inputs_embeds=inputs_embeds)
419
+
420
+ mx.async_eval(y)
421
+
422
+ n = 0
423
+ while True:
424
+ if n != max_tokens:
425
+ next_y, next_logprobs = _step(y[None])
426
+ mx.async_eval(next_y)
427
+ if n == 0:
428
+ mx.eval(y)
429
+ if n == max_tokens:
430
+ break
431
+
432
+ yield y.item(), logprobs
433
+ if n % 256 == 0:
434
+ mx.clear_cache()
435
+ y, logprobs = next_y, next_logprobs
436
+ n += 1
437
+
438
+
439
+ def stream_generate(
440
+ model: nn.Module,
441
+ processor: PreTrainedTokenizer,
442
+ prompt: str,
443
+ image: Union[str, List[str]] = None,
444
+ audio: Union[str, List[str]] = None,
445
+ **kwargs,
446
+ ) -> Union[str, Generator[str, None, None]]:
447
+ """
448
+ A generator producing text based on the given prompt from the model.
449
+
450
+ Args:
451
+ model (nn.Module): The model to use for generation.
452
+ processor (PreTrainedTokenizer): The tokenizer/processor.
453
+ prompt (str): The input prompt text.
454
+ image (Union[str, List[str]], optional): Image path(s) or URL(s).
455
+ audio (Union[str, List[str]], optional): Audio file path(s).
456
+ prefill_step_size (int, optional): Number of tokens to process per prefill
457
+ step. When set, enables chunked prefill which processes long prompts in
458
+ smaller chunks to reduce peak memory usage.
459
+ kwargs: Additional options passed to :func:`generate_step`.
460
+ See :func:`generate_step` for more details.
461
+
462
+ Yields:
463
+ Generator[GenerationResult]: A generator producing GenerationResult objects
464
+ containing the generated text, tokens, and statistics.
465
+ """
466
+ tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
467
+
468
+ # Skip special tokens
469
+ skip_special_tokens = kwargs.pop("skip_special_tokens", False)
470
+ skip_special_token_ids = (
471
+ set(tokenizer.all_special_ids)
472
+ if skip_special_tokens and hasattr(tokenizer, "all_special_ids")
473
+ else []
474
+ )
475
+
476
+ add_special_tokens = (
477
+ not hasattr(processor, "chat_template")
478
+ if model.config.model_type in ["gemma3", "gemma3n"]
479
+ else True
480
+ )
481
+
482
+ resize_shape = kwargs.pop("resize_shape", None)
483
+ image_token_index = getattr(model.config, "image_token_index", None)
484
+
485
+ if kwargs.get("input_ids", None) is not None:
486
+ input_ids = kwargs.pop("input_ids")
487
+ pixel_values = kwargs.pop("pixel_values", None)
488
+ mask = kwargs.pop("mask", None)
489
+ else:
490
+ inputs = prepare_inputs(
491
+ processor,
492
+ images=image,
493
+ audio=audio,
494
+ prompts=prompt,
495
+ image_token_index=image_token_index,
496
+ resize_shape=resize_shape,
497
+ add_special_tokens=add_special_tokens,
498
+ **kwargs,
499
+ )
500
+ input_ids = inputs.get("input_ids", None)
501
+ pixel_values = inputs.get("pixel_values", None)
502
+ mask = inputs.get("attention_mask", None)
503
+ data_kwargs = {
504
+ k: v
505
+ for k, v in inputs.items()
506
+ if k not in ["input_ids", "pixel_values", "attention_mask"]
507
+ }
508
+ kwargs.update(data_kwargs)
509
+
510
+ with wired_limit(model, [generation_stream]):
511
+ detokenizer = processor.detokenizer
512
+ detokenizer.reset()
513
+ tic = time.perf_counter()
514
+
515
+ # #region agent log
516
+ import json
517
+ log_file = "/Users/zekieldee/Desktop/code/mlx-vlm/.cursor/debug.log"
518
+ def log_debug(location, message, data, hypothesis_id):
519
+ try:
520
+ with open(log_file, "a") as f:
521
+ f.write(json.dumps({"sessionId": "debug-session", "runId": "generation", "hypothesisId": hypothesis_id, "location": location, "message": message, "data": data, "timestamp": __import__("time").time_ns() // 1000000}) + "\n")
522
+ except: pass
523
+
524
+ log_debug("generate.py:stream_generate_start", "Tokenizer and model info", {
525
+ "model_type": model.config.model_type if hasattr(model.config, "model_type") else "unknown",
526
+ "tokenizer_class": tokenizer.__class__.__name__,
527
+ "vocab_size": tokenizer.vocab_size if hasattr(tokenizer, "vocab_size") else "unknown",
528
+ "eos_token_id": tokenizer.eos_token_id if hasattr(tokenizer, "eos_token_id") else "unknown",
529
+ "bos_token_id": tokenizer.bos_token_id if hasattr(tokenizer, "bos_token_id") else "unknown",
530
+ "pad_token_id": tokenizer.pad_token_id if hasattr(tokenizer, "pad_token_id") else "unknown",
531
+ }, "H2,H3,H4")
532
+ # #endregion
533
+
534
+ try:
535
+ for n, (token, logprobs) in enumerate(
536
+ generate_step(input_ids, model, pixel_values, mask, **kwargs)
537
+ ):
538
+ if n == 0:
539
+ prompt_time = time.perf_counter() - tic
540
+ prompt_tps = input_ids.size / prompt_time
541
+ tic = time.perf_counter()
542
+
543
+ # #region agent log
544
+ top5_indices = mx.argsort(logprobs)[-5:].tolist()
545
+ top5_values = mx.sort(logprobs)[-5:].tolist()
546
+ log_debug("generate.py:first_token", "First token generated", {
547
+ "token_id": int(token),
548
+ "token_str": tokenizer.decode([token]) if hasattr(tokenizer, "decode") else "N/A",
549
+ "logprobs_shape": str(logprobs.shape),
550
+ "logprobs_top5_indices": top5_indices,
551
+ "logprobs_top5_values": top5_values,
552
+ }, "H2,H4")
553
+ # #endregion
554
+
555
+ # Stop generation if the token is in the eos_token_ids
556
+ if tokenizer.stopping_criteria(token):
557
+ # #region agent log
558
+ log_debug("generate.py:eos_detected", "EOS token detected", {"token_id": int(token), "iteration": n}, "H4")
559
+ # #endregion
560
+ break
561
+
562
+ # #region agent log
563
+ if n < 5: # Log first 5 tokens
564
+ decoded_token = tokenizer.decode([token]) if hasattr(tokenizer, "decode") else "N/A"
565
+ log_debug("generate.py:token_decode", f"Token {n} decode", {
566
+ "iteration": n,
567
+ "token_id": int(token),
568
+ "decoded_single": decoded_token,
569
+ "detokenizer_segment": detokenizer.text if hasattr(detokenizer, "text") else "N/A",
570
+ }, "H2,H4")
571
+ # #endregion
572
+
573
+ detokenizer.add_token(
574
+ token, skip_special_token_ids=skip_special_token_ids
575
+ )
576
+
577
+ # Yield the last segment if streaming
578
+ yield GenerationResult(
579
+ text=detokenizer.last_segment,
580
+ token=token,
581
+ logprobs=logprobs,
582
+ prompt_tokens=input_ids.size,
583
+ generation_tokens=n + 1,
584
+ total_tokens=input_ids.size + n + 1,
585
+ prompt_tps=prompt_tps,
586
+ generation_tps=(n + 1) / (time.perf_counter() - tic),
587
+ peak_memory=mx.get_peak_memory() / 1e9,
588
+ )
589
+
590
+ detokenizer.finalize()
591
+
592
+ yield GenerationResult(
593
+ text=detokenizer.last_segment,
594
+ token=token,
595
+ logprobs=logprobs,
596
+ prompt_tokens=input_ids.size,
597
+ generation_tokens=n + 1,
598
+ total_tokens=input_ids.size + n + 1,
599
+ prompt_tps=prompt_tps,
600
+ generation_tps=(n + 1) / (time.perf_counter() - tic),
601
+ peak_memory=mx.get_peak_memory() / 1e9,
602
+ )
603
+ except Exception as e:
604
+ raise
605
+
606
+ # Cleanup after generation
607
+ mx.clear_cache()
608
+
609
+
610
+ def generate(
611
+ model: nn.Module,
612
+ processor: PreTrainedTokenizer,
613
+ prompt: str,
614
+ image: Union[str, List[str]] = None,
615
+ audio: Union[str, List[str]] = None,
616
+ verbose: bool = False,
617
+ **kwargs,
618
+ ) -> GenerationResult:
619
+ """
620
+ Generate text from the model.
621
+
622
+ Args:
623
+ model (nn.Module): The language model.
624
+ tokenizer (PreTrainedTokenizer): The tokenizer.
625
+ prompt (str): The string prompt.
626
+ temperature (float): The temperature for sampling (default 0).
627
+ max_tokens (int): The maximum number of tokens (default 100).
628
+ verbose (bool): If ``True``, print tokens and timing information
629
+ (default ``False``).
630
+ formatter (Optional[Callable]): A function which takes a token and a
631
+ probability and displays it.
632
+ repetition_penalty (float, optional): The penalty factor for repeating tokens.
633
+ repetition_context_size (int, optional): The number of tokens to consider for repetition penalty.
634
+ """
635
+
636
+ if verbose:
637
+ print("=" * 10)
638
+ files = []
639
+ if image is not None:
640
+ files.extend(image)
641
+ if audio is not None:
642
+ files.extend(audio)
643
+ if kwargs.get("video") is not None:
644
+ files.extend(kwargs.get("video"))
645
+
646
+ print(f"Files: {files}", "\n")
647
+
648
+ print("Prompt:", prompt)
649
+
650
+ text = ""
651
+ last_response = None
652
+
653
+ eos_tokens = kwargs.get("eos_tokens", None)
654
+ stopping_criteria = kwargs.get("stopping_criteria", None)
655
+
656
+ # Get the tokenizer
657
+ tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
658
+
659
+ # Add custom EOS tokens to the stopping criteria
660
+ if eos_tokens is not None:
661
+ tokenizer.stopping_criteria.add_eos_token_ids(eos_tokens)
662
+
663
+ # Use custom stopping criteria
664
+ elif stopping_criteria is not None:
665
+ if isinstance(stopping_criteria, StoppingCriteria) or callable(
666
+ stopping_criteria
667
+ ):
668
+ tokenizer.stopping_criteria = stopping_criteria
669
+ else:
670
+ raise ValueError(
671
+ "stopping_criteria must be an instance of StoppingCriteria or a callable"
672
+ )
673
+ else:
674
+ tokenizer.stopping_criteria.reset(model.config.eos_token_id)
675
+
676
+ for response in stream_generate(model, processor, prompt, image, audio, **kwargs):
677
+ if verbose:
678
+ print(response.text, end="", flush=True)
679
+ text += response.text
680
+ last_response = response
681
+
682
+ if verbose:
683
+ print("\n" + "=" * 10)
684
+ if len(text) == 0:
685
+ print("No text generated for this prompt")
686
+ return GenerationResult(
687
+ text=text,
688
+ token=None,
689
+ logprobs=None,
690
+ prompt_tokens=0,
691
+ generation_tokens=0,
692
+ total_tokens=0,
693
+ prompt_tps=0.0,
694
+ generation_tps=0.0,
695
+ peak_memory=mx.get_peak_memory() / 1e9,
696
+ )
697
+ print(
698
+ f"Prompt: {last_response.prompt_tokens} tokens, "
699
+ f"{last_response.prompt_tps:.3f} tokens-per-sec"
700
+ )
701
+ print(
702
+ f"Generation: {last_response.generation_tokens} tokens, "
703
+ f"{last_response.generation_tps:.3f} tokens-per-sec"
704
+ )
705
+ print(f"Peak memory: {last_response.peak_memory:.3f} GB")
706
+
707
+ return GenerationResult(
708
+ text=text,
709
+ token=last_response.token,
710
+ logprobs=last_response.logprobs,
711
+ prompt_tokens=last_response.prompt_tokens,
712
+ generation_tokens=last_response.generation_tokens,
713
+ total_tokens=last_response.total_tokens,
714
+ prompt_tps=last_response.prompt_tps,
715
+ generation_tps=last_response.generation_tps,
716
+ peak_memory=last_response.peak_memory,
717
+ )
718
+
719
+
720
+ @dataclass
721
+ class BatchGenerationResult:
722
+ """
723
+ Result of batch generation with optional image size tracking.
724
+
725
+ Attributes:
726
+ texts: Generated text for each sample
727
+ tokens: Last generated token for each sample
728
+ logprobs: Log probabilities for each sample
729
+ prompt_tokens: Number of prompt tokens per sample
730
+ generation_tokens: Number of generated tokens per sample
731
+ total_tokens: Total tokens (prompt + generation) per sample
732
+ prompt_tps: Prompt tokens per second per sample
733
+ generation_tps: Generation tokens per second per sample
734
+ peak_memory: Peak memory usage in GB
735
+ image_sizes: Original (height, width) for each image (for tracking)
736
+ """
737
+
738
+ texts: List[str]
739
+ tokens: List[Optional[int]]
740
+ logprobs: List[Optional[List[float]]]
741
+ prompt_tokens: List[int]
742
+ generation_tokens: List[int]
743
+ total_tokens: List[int]
744
+ prompt_tps: List[float]
745
+ generation_tps: List[float]
746
+ peak_memory: float = 0.0
747
+ image_sizes: Optional[List[Tuple[int, int]]] = None
748
+
749
+
750
+ def _left_pad_prompts(prompts, max_length=None):
751
+ if max_length is None:
752
+ max_length = max(len(p) for p in prompts)
753
+
754
+ return mx.array([[0] * (max_length - len(p)) + p for p in prompts])
755
+
756
+
757
+ def _make_cache(model, left_padding):
758
+ """
759
+ Convert a list of regular caches into their corresponding
760
+ batch-aware caches.
761
+ """
762
+
763
+ def to_batch_cache(c):
764
+ if isinstance(c, cache.KVCache):
765
+ return cache.BatchKVCache(left_padding)
766
+ elif isinstance(c, cache.ArraysCache):
767
+ c.left_padding = mx.array(left_padding)
768
+ return c
769
+ elif isinstance(c, cache.RotatingKVCache):
770
+ if c.keep > 0:
771
+ raise ValueError("RotatingKVCache with keep tokens is not supported.")
772
+ return cache.BatchRotatingKVCache(c.max_size, left_padding)
773
+ elif isinstance(c, cache.CacheList):
774
+ return cache.BatchCacheList(*(to_batch_cache(sub_c) for sub_c in c.caches))
775
+ else:
776
+ raise ValueError(f"{type(c)} does not yet support batching")
777
+
778
+ if hasattr(model, "make_cache"):
779
+ model_cache = model.make_cache()
780
+ return [to_batch_cache(c) for c in model_cache]
781
+ else:
782
+ return [cache.BatchKVCache(left_padding) for _ in model.layers]
783
+
784
+
785
+ @dataclass
786
+ class BatchStats:
787
+ """
788
+ A data object to hold generation stats.
789
+
790
+ Args:
791
+ prompt_tokens (int): The number of prompt tokens processed.
792
+ prompt_tps (float): The prompt processing tokens-per-second.
793
+ prompt_time (float): The time in seconds spent in prompt processing.
794
+ generation_tokens (int): The number of generated tokens.
795
+ generation_tps (float): The tokens-per-second for generation.
796
+ generation_time (float): The time in seconds spent in generation.
797
+ peak_memory (float): The peak memory used so far in GB.
798
+ """
799
+
800
+ prompt_tokens: int = 0
801
+ prompt_tps: float = 0
802
+ prompt_time: float = 0
803
+ generation_tokens: int = 0
804
+ generation_tps: float = 0
805
+ generation_time: float = 0
806
+ peak_memory: float = 0
807
+
808
+
809
+ @dataclass
810
+ class BatchResponse:
811
+ """
812
+ A data object to hold a batch generation response.
813
+
814
+ Args:
815
+ texts: (List[str]): The generated text for each prompt.
816
+ stats (BatchStats): Statistics about the generation.
817
+ image_sizes: (Optional[List[Tuple[int, int]]]): Original (height, width)
818
+ for each image. Useful for tracking which images produced which responses
819
+ and for debugging padding/batching behavior.
820
+ """
821
+
822
+ texts: List[str]
823
+ stats: BatchStats
824
+ image_sizes: Optional[List[Tuple[int, int]]] = None
825
+
826
+
827
+ @dataclass
828
+ class Batch:
829
+ uids: List[int]
830
+ y: mx.array
831
+ logprobs: mx.array
832
+ max_tokens: List[int]
833
+ num_tokens: List[int]
834
+ cache: List[Any]
835
+
836
+ def __len__(self):
837
+ return len(self.uids)
838
+
839
+ def filter(self, keep_idx: List[int]):
840
+ self.uids = [self.uids[k] for k in keep_idx]
841
+ self.max_tokens = [self.max_tokens[k] for k in keep_idx]
842
+ self.num_tokens = [self.num_tokens[k] for k in keep_idx]
843
+ keep_idx = mx.array(keep_idx, mx.int32)
844
+ self.y = self.y[keep_idx]
845
+ self.logprobs = self.logprobs[keep_idx]
846
+ for c in self.cache:
847
+ c.filter(keep_idx)
848
+
849
+ def extend(self, other):
850
+ self.uids.extend(other.uids)
851
+ self.y = mx.concatenate([self.y, other.y])
852
+ self.logprobs = mx.concatenate([self.logprobs, other.logprobs])
853
+ self.num_tokens.extend(other.num_tokens)
854
+ self.max_tokens.extend(other.max_tokens)
855
+ for c, o in zip(self.cache, other.cache):
856
+ c.extend(o)
857
+
858
+
859
+ class BatchGenerator:
860
+
861
+ @dataclass
862
+ class Response:
863
+ uid: int
864
+ token: int
865
+ logprobs: mx.array
866
+ finish_reason: Optional[str]
867
+
868
+ def __init__(
869
+ self,
870
+ model,
871
+ processor,
872
+ max_tokens: int = 128,
873
+ stop_tokens: Optional[set] = None,
874
+ sampler: Optional[Callable[[mx.array], mx.array]] = None,
875
+ completion_batch_size: int = 32,
876
+ prefill_batch_size: int = 8,
877
+ prefill_step_size: int = 2048,
878
+ prompt_cache=None,
879
+ ):
880
+ self.model = model
881
+ self.unprocessed_prompts = []
882
+ self.max_tokens = max_tokens
883
+ self.processor = processor
884
+ self.tokenizer = (
885
+ processor.tokenizer if hasattr(processor, "tokenizer") else processor
886
+ )
887
+ self.sampler = sampler or (lambda x: mx.argmax(x, axis=-1))
888
+ self.uid_count = 0
889
+ self.prefill_step_size = prefill_step_size
890
+ self.prefill_batch_size = prefill_batch_size
891
+ self.completion_batch_size = completion_batch_size
892
+ self.prompt_cache = prompt_cache
893
+ self._stats = BatchStats()
894
+
895
+ self.tokenizer.stopping_criteria.add_eos_token_ids(stop_tokens)
896
+
897
+ self.active_batch = None
898
+
899
+ def insert(self, prompts, max_tokens: Union[List[int], int, None] = None):
900
+ uids = []
901
+
902
+ if max_tokens is None or isinstance(max_tokens, int):
903
+ max_tokens = [max_tokens or self.max_tokens] * len(prompts)
904
+
905
+ for p, m in zip(prompts, max_tokens):
906
+ self.unprocessed_prompts.append((self.uid_count, p, m))
907
+ uids.append(self.uid_count)
908
+ self.uid_count += 1
909
+ # Sort in ascending order of length
910
+ self.unprocessed_prompts = sorted(
911
+ self.unprocessed_prompts, key=lambda x: len(x[1])
912
+ )
913
+ return uids
914
+
915
+ def _process_prompts(self, prompts, **kwargs) -> Batch:
916
+ uids, inputs, max_tokens = zip(*prompts)
917
+ lengths = [len(p) for p in inputs]
918
+ max_length = max(lengths)
919
+
920
+ self._stats.prompt_tokens += sum(lengths)
921
+ left_padding = [max_length - l for l in lengths]
922
+ inputs = _left_pad_prompts(inputs, max_length=max_length)
923
+
924
+ prompt_cache = (
925
+ _make_cache(self.model, left_padding)
926
+ if self.prompt_cache is None
927
+ else self.prompt_cache
928
+ )
929
+
930
+ # Slice batch data in kwargs to match current batch size
931
+ batch_size = len(uids)
932
+ for key, value in kwargs.items():
933
+ if isinstance(value, mx.array) and value.ndim > 0:
934
+ kwargs[key] = value[:batch_size]
935
+
936
+ inputs_embeds = kwargs.pop("inputs_embeds", None)
937
+
938
+ if inputs_embeds is not None:
939
+ # Multimodal prefill
940
+ while inputs_embeds.shape[1] > 1:
941
+ n_to_process = min(self.prefill_step_size, inputs_embeds.shape[1] - 1)
942
+ self.model(
943
+ inputs[:, :n_to_process],
944
+ cache=prompt_cache,
945
+ inputs_embeds=inputs_embeds[:, :n_to_process],
946
+ n_to_process=n_to_process,
947
+ **kwargs,
948
+ )
949
+ mx.eval([c.state for c in prompt_cache])
950
+ inputs_embeds = inputs_embeds[:, n_to_process:]
951
+ inputs = inputs[:, n_to_process:]
952
+ mx.clear_cache()
953
+
954
+ kwargs = {"inputs_embeds": inputs_embeds}
955
+
956
+ else:
957
+ # Text-only prefill
958
+ while inputs.shape[1] > 1 and inputs_embeds is None:
959
+ n_to_process = min(self.prefill_step_size, inputs.shape[1] - 1)
960
+ self.model(inputs[:, :n_to_process], cache=prompt_cache)
961
+ mx.eval([c.state for c in prompt_cache])
962
+ inputs = inputs[:, n_to_process:]
963
+ mx.clear_cache()
964
+
965
+ y, logprobs = self._step(inputs, prompt_cache, **kwargs)
966
+ mx.async_eval(y, logprobs)
967
+ mx.clear_cache()
968
+ return Batch(
969
+ list(uids), y, logprobs, list(max_tokens), [0] * len(uids), prompt_cache
970
+ )
971
+
972
+ def _step(self, input_tokens: mx.array, prompt_cache: List[Any], **kwargs):
973
+ output = self.model(input_tokens, cache=prompt_cache, **kwargs)
974
+ logits = output.logits[:, -1, :]
975
+ logprobs = logits - mx.logsumexp(logits, axis=-1, keepdims=True)
976
+ sampled = self.sampler(logprobs)
977
+
978
+ # TODO: Add KV cache quantization if specified
979
+ return sampled, logprobs
980
+
981
+ def stats(self):
982
+ self._stats.prompt_tps = self._stats.prompt_tokens / self._stats.prompt_time
983
+ self._stats.generation_tps = (
984
+ self._stats.generation_tokens / self._stats.generation_time
985
+ )
986
+ self._stats.peak_memory = mx.get_peak_memory() / 1e9
987
+ return self._stats
988
+
989
+ def _next(self, **kwargs):
990
+ tic = time.perf_counter()
991
+
992
+ prompt_processing = False
993
+ batch = self.active_batch
994
+ num_active = len(batch) if batch else 0
995
+ num_to_add = self.completion_batch_size - num_active
996
+ while num_to_add >= self.prefill_batch_size:
997
+ prompts = self.unprocessed_prompts[: self.prefill_batch_size]
998
+ # Finish processing the last examples of the last batch
999
+ if len(prompts) == 0 and num_active > 0:
1000
+ break
1001
+ # No more prompts and no more completions, all done
1002
+ elif len(prompts) == 0:
1003
+ self.active_batch = None
1004
+ return []
1005
+ # Process prompts
1006
+ if batch is not None and not prompt_processing:
1007
+ # Finish any active completion tokens
1008
+ mx.eval(batch.y, batch.logprobs)
1009
+ self._stats.generation_time += time.perf_counter() - tic
1010
+ tic = time.perf_counter()
1011
+
1012
+ batch = self._process_prompts(prompts, **kwargs)
1013
+ self.unprocessed_prompts = self.unprocessed_prompts[
1014
+ self.prefill_batch_size :
1015
+ ]
1016
+ prompt_processing = True
1017
+ # If there was no active batch, set it
1018
+ if self.active_batch is None:
1019
+ self.active_batch = batch
1020
+ else:
1021
+ self.active_batch.extend(batch)
1022
+
1023
+ num_active = len(self.active_batch)
1024
+ num_to_add -= len(batch)
1025
+
1026
+ batch = self.active_batch
1027
+ y, logprobs = batch.y, batch.logprobs
1028
+ batch.y, batch.logprobs = self._step(y[:, None], batch.cache)
1029
+ mx.async_eval(batch.y, batch.logprobs)
1030
+
1031
+ y = y.tolist()
1032
+ toc = time.perf_counter()
1033
+ if prompt_processing:
1034
+ self._stats.prompt_time += toc - tic
1035
+ else:
1036
+ self._stats.generation_time += toc - tic
1037
+ keep_idx = []
1038
+ end_idx = []
1039
+ responses = []
1040
+
1041
+ for e, (t, uid, num_tok, max_tok) in enumerate(
1042
+ zip(y, batch.uids, batch.num_tokens, batch.max_tokens)
1043
+ ):
1044
+ num_tok += 1
1045
+ batch.num_tokens[e] = num_tok
1046
+ if self.tokenizer.stopping_criteria(t):
1047
+ finish_reason = "stop"
1048
+ end_idx.append(e)
1049
+ elif num_tok >= max_tok:
1050
+ finish_reason = "length"
1051
+ end_idx.append(e)
1052
+ else:
1053
+ finish_reason = None
1054
+ keep_idx.append(e)
1055
+ responses.append(self.Response(uid, t, logprobs[e], finish_reason))
1056
+
1057
+ # Remove any finished completions
1058
+ if len(end_idx):
1059
+ if len(keep_idx) > 0:
1060
+ batch.filter(keep_idx)
1061
+ else:
1062
+ self.active_batch = None
1063
+
1064
+ self._stats.generation_tokens += len(responses)
1065
+
1066
+ if len(responses) > 0 and self._stats.generation_tokens % 100 == 0:
1067
+ mx.clear_cache()
1068
+
1069
+ return responses
1070
+
1071
+ def next(self, **kwargs):
1072
+ with mx.stream(generation_stream):
1073
+ return self._next(**kwargs)
1074
+
1075
+
1076
+ def batch_generate(
1077
+ model,
1078
+ processor,
1079
+ images: Union[str, List[str]] = None,
1080
+ audios: Union[str, List[str]] = None,
1081
+ prompts: List[str] = None,
1082
+ max_tokens: Union[int, List[int]] = 128,
1083
+ verbose: bool = False,
1084
+ group_by_shape: bool = True,
1085
+ track_image_sizes: bool = True,
1086
+ **kwargs,
1087
+ ):
1088
+ """
1089
+ Generate responses for the given batch of prompts with variable-sized images.
1090
+
1091
+ This function implements the transformers-style approach to batching:
1092
+ 1. Group images with the same shape for efficient batch processing
1093
+ 2. Process each group as a batch (no padding waste within groups)
1094
+ 3. Track original image sizes for proper attention masking
1095
+ 4. Restore results to original batch order
1096
+
1097
+ Key insight: Instead of padding all images to the same spatial dimensions
1098
+ (which wastes computation and may hurt accuracy), we group same-sized
1099
+ images together so there's zero padding within each group.
1100
+
1101
+ Args:
1102
+ model (nn.Module): The language model.
1103
+ processor (PreTrainedTokenizer): The tokenizer/processor.
1104
+ images (Union[str, List[str]]): Images (paths, URLs, or PIL images).
1105
+ audios (Union[str, List[str]]): Audio files (not yet supported for batching).
1106
+ prompts (List[str]): The input prompts.
1107
+ max_tokens (Union[int, List[int]]): Maximum number of output tokens. This
1108
+ can be per prompt if a list is provided.
1109
+ verbose (bool): If ``True``, print tokens and timing information.
1110
+ Default: ``False``.
1111
+ group_by_shape (bool): If ``True``, group same-shaped images for efficient
1112
+ batch processing. Default: ``True``.
1113
+ track_image_sizes (bool): If ``True``, track and return original image sizes.
1114
+ Default: ``True``.
1115
+ kwargs: The remaining options get passed to :obj:`BatchGenerator`.
1116
+ See :obj:`BatchGenerator` for more details.
1117
+
1118
+ Returns:
1119
+ BatchResponse with generated texts, statistics, and optionally image_sizes.
1120
+ """
1121
+ from PIL import Image
1122
+
1123
+ from .utils import process_image
1124
+
1125
+ processor.detokenizer.reset()
1126
+ tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
1127
+
1128
+ # Handle single image case
1129
+ if isinstance(images, str):
1130
+ images = [images]
1131
+
1132
+ # Handle no images case
1133
+ if images is None:
1134
+ texts, stats = _generate_batch(
1135
+ model, processor, prompts, None, max_tokens, verbose, **kwargs
1136
+ )
1137
+ return BatchResponse(texts, stats)
1138
+
1139
+ # Load and preprocess images
1140
+ image_processor = (
1141
+ processor.image_processor if hasattr(processor, "image_processor") else None
1142
+ )
1143
+
1144
+ processed_images = []
1145
+ image_sizes_original = []
1146
+ for img in images:
1147
+ if isinstance(img, str):
1148
+ pil_img = process_image(img, None, image_processor)
1149
+ elif isinstance(img, Image.Image):
1150
+ pil_img = img
1151
+ else:
1152
+ pil_img = img
1153
+ processed_images.append(pil_img)
1154
+ # Track original size
1155
+ if hasattr(pil_img, "height"):
1156
+ image_sizes_original.append((pil_img.height, pil_img.width))
1157
+ else:
1158
+ image_sizes_original.append((0, 0))
1159
+
1160
+ # Group images by shape for efficient processing (no padding within groups)
1161
+ if group_by_shape and len(processed_images) > 1:
1162
+ grouped_images, grouped_indices = group_images_by_shape(processed_images)
1163
+
1164
+ if verbose:
1165
+ print(f"[batch_generate] Found {len(grouped_images)} unique image shapes")
1166
+ else:
1167
+ # Single image or grouping disabled - treat as one group
1168
+ shape = (
1169
+ (processed_images[0].height, processed_images[0].width)
1170
+ if processed_images
1171
+ else (0, 0)
1172
+ )
1173
+ grouped_images = {shape: processed_images}
1174
+ grouped_indices = {shape: list(range(len(processed_images)))}
1175
+
1176
+ # Process each shape group
1177
+ all_texts = [None] * len(prompts)
1178
+ all_image_sizes = [None] * len(prompts)
1179
+ total_stats = BatchStats()
1180
+
1181
+ for shape, indices in grouped_indices.items():
1182
+ # Get images and prompts for this shape group
1183
+ group_images = [processed_images[i] for i in indices]
1184
+ group_prompts = [prompts[i] for i in indices]
1185
+ group_sizes = [image_sizes_original[i] for i in indices]
1186
+
1187
+ # Handle per-sample max_tokens
1188
+ if isinstance(max_tokens, list):
1189
+ group_max_tokens = [max_tokens[i] for i in indices]
1190
+ else:
1191
+ group_max_tokens = max_tokens
1192
+
1193
+ # Process the entire group at once (same shape = no padding needed)
1194
+ chunk_texts, chunk_stats = _generate_batch(
1195
+ model,
1196
+ processor,
1197
+ group_prompts,
1198
+ group_images,
1199
+ group_max_tokens,
1200
+ **kwargs,
1201
+ )
1202
+
1203
+ # Store results in original order
1204
+ for j, orig_idx in enumerate(indices):
1205
+ all_texts[orig_idx] = chunk_texts[j]
1206
+ all_image_sizes[orig_idx] = group_sizes[j]
1207
+
1208
+ # Accumulate stats
1209
+ total_stats.prompt_tokens += chunk_stats.prompt_tokens
1210
+ total_stats.prompt_time += chunk_stats.prompt_time
1211
+ total_stats.generation_tokens += chunk_stats.generation_tokens
1212
+ total_stats.generation_time += chunk_stats.generation_time
1213
+
1214
+ mx.clear_cache()
1215
+
1216
+ # Compute final stats
1217
+ if total_stats.prompt_time > 0:
1218
+ total_stats.prompt_tps = total_stats.prompt_tokens / total_stats.prompt_time
1219
+ if total_stats.generation_time > 0:
1220
+ total_stats.generation_tps = (
1221
+ total_stats.generation_tokens / total_stats.generation_time
1222
+ )
1223
+ total_stats.peak_memory = mx.get_peak_memory() / 1e9
1224
+
1225
+ if verbose:
1226
+ print(f"[batch_generate] Finished processing {len(prompts)} samples")
1227
+ print(
1228
+ f"[batch_generate] Prompt: {total_stats.prompt_tokens} tokens, {total_stats.prompt_tps:.3f} tokens-per-sec"
1229
+ )
1230
+ print(
1231
+ f"[batch_generate] Generation: {total_stats.generation_tokens} tokens, "
1232
+ f"{total_stats.generation_tps:.3f} tokens-per-sec"
1233
+ )
1234
+ print(f"[batch_generate] Peak memory: {total_stats.peak_memory:.3f} GB")
1235
+
1236
+ response = BatchResponse(all_texts, total_stats)
1237
+ if track_image_sizes:
1238
+ response.image_sizes = all_image_sizes
1239
+ return response
1240
+
1241
+
1242
+ def _generate_batch(
1243
+ model,
1244
+ processor,
1245
+ prompts: List[str],
1246
+ images: List = None,
1247
+ max_tokens: Union[int, List[int]] = 100,
1248
+ verbose: bool = False,
1249
+ **kwargs,
1250
+ ) -> Tuple[List[str], BatchStats]:
1251
+
1252
+ tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
1253
+ batch_size = len(prompts)
1254
+
1255
+ num_images_list = [
1256
+ 1 if i < (len(images) if images is not None else 0) else 0
1257
+ for i in range(len(prompts))
1258
+ ]
1259
+ formatted_prompts = [
1260
+ apply_chat_template(
1261
+ processor,
1262
+ model.config,
1263
+ p,
1264
+ num_images=num_images_list[i],
1265
+ )
1266
+ for i, p in enumerate(prompts)
1267
+ ]
1268
+
1269
+ add_special_tokens = (
1270
+ not hasattr(processor, "chat_template")
1271
+ if model.config.model_type in ["gemma3", "gemma3n"]
1272
+ else True
1273
+ )
1274
+
1275
+ resize_shape = kwargs.pop("resize_shape", None)
1276
+ image_token_index = getattr(model.config, "image_token_index", None)
1277
+
1278
+ inputs = prepare_inputs(
1279
+ processor,
1280
+ images=images,
1281
+ audio=None,
1282
+ prompts=formatted_prompts,
1283
+ image_token_index=image_token_index,
1284
+ resize_shape=resize_shape,
1285
+ add_special_tokens=add_special_tokens,
1286
+ pad_to_uniform_size=False, # Since images are pre-grouped by shape, they're already uniform size
1287
+ )
1288
+ input_ids = inputs.get("input_ids", None)
1289
+ pixel_values = inputs.get("pixel_values", None)
1290
+
1291
+ data_kwargs = {
1292
+ k: v
1293
+ for k, v in inputs.items()
1294
+ if k not in ["input_ids", "pixel_values", "attention_mask"]
1295
+ }
1296
+
1297
+ # Use batch_size for prefill and completion to ensure consistent processing
1298
+ gen = BatchGenerator(
1299
+ model.language_model,
1300
+ processor,
1301
+ prefill_batch_size=batch_size,
1302
+ completion_batch_size=batch_size,
1303
+ **kwargs,
1304
+ )
1305
+
1306
+ with wired_limit(model, [generation_stream]):
1307
+ if pixel_values is not None:
1308
+ embedding_output = model.get_input_embeddings(
1309
+ input_ids, pixel_values, **data_kwargs
1310
+ )
1311
+
1312
+ # Normalize embedding output to a kwargs dict expected by BatchGenerator
1313
+ if isinstance(embedding_output, dict):
1314
+ embed_kwargs = embedding_output
1315
+ elif hasattr(embedding_output, "to_dict"):
1316
+ # Convert to dict and keep non-None fields
1317
+ embed_kwargs = {
1318
+ k: v for k, v in embedding_output.to_dict().items() if v is not None
1319
+ }
1320
+ else:
1321
+ # Assume it's directly an inputs_embeds array
1322
+ embed_kwargs = {"inputs_embeds": embedding_output}
1323
+
1324
+ gen_kwargs = {
1325
+ "pixel_values": pixel_values,
1326
+ **data_kwargs,
1327
+ **embed_kwargs,
1328
+ }
1329
+ else:
1330
+ input_ids = mx.squeeze(input_ids, axis=0)
1331
+ gen_kwargs = {}
1332
+
1333
+ uids = gen.insert(input_ids.tolist(), max_tokens)
1334
+ results = {uid: [] for uid in uids}
1335
+ while responses := gen.next(**gen_kwargs):
1336
+ for r in responses:
1337
+ if r.finish_reason != "stop":
1338
+ results[r.uid].append(r.token)
1339
+
1340
+ texts = [tokenizer.decode(results[uid]) for uid in uids]
1341
+ return texts, gen.stats()
1342
+
1343
+
1344
+ def main():
1345
+ args = parse_arguments()
1346
+ if isinstance(args.image, str):
1347
+ args.image = [args.image]
1348
+
1349
+ model, processor = load(
1350
+ args.model,
1351
+ args.adapter_path,
1352
+ revision=args.revision,
1353
+ trust_remote_code=args.trust_remote_code,
1354
+ )
1355
+ config = model.config
1356
+
1357
+ prompt = args.prompt
1358
+
1359
+ num_images = len(args.image) if args.image is not None else 0
1360
+ num_audios = (
1361
+ 1 if args.audio is not None else 0
1362
+ ) # TODO: Support multiple audio files
1363
+ prompt = apply_chat_template(
1364
+ processor, config, prompt, num_images=num_images, num_audios=num_audios
1365
+ )
1366
+
1367
+ kwargs = {}
1368
+
1369
+ if args.resize_shape is not None:
1370
+ if len(args.resize_shape) not in [1, 2]:
1371
+ raise ValueError("Resize shape must be 1 or 2 integers")
1372
+ kwargs["resize_shape"] = (
1373
+ (args.resize_shape[0],) * 2
1374
+ if len(args.resize_shape) == 1
1375
+ else tuple(args.resize_shape)
1376
+ )
1377
+
1378
+ if args.eos_tokens is not None:
1379
+ eos_tokens = []
1380
+ for token in args.eos_tokens:
1381
+ try:
1382
+ decoded_token = codecs.decode(token, "unicode_escape")
1383
+ eos_tokens.append(decoded_token)
1384
+ except (UnicodeDecodeError, UnicodeError):
1385
+ eos_tokens.append(token)
1386
+ kwargs["eos_tokens"] = eos_tokens
1387
+
1388
+ if args.skip_special_tokens:
1389
+ kwargs["skip_special_tokens"] = args.skip_special_tokens
1390
+
1391
+ # Add processor kwargs from JSON
1392
+ if args.processor_kwargs:
1393
+ kwargs.update(args.processor_kwargs)
1394
+
1395
+ if args.chat:
1396
+ chat = []
1397
+ if args.system:
1398
+ chat.append({"role": "system", "content": args.system})
1399
+ while user := input("User:"):
1400
+ chat.append({"role": "user", "content": user})
1401
+ prompt = apply_chat_template(processor, config, chat, num_images=num_images)
1402
+ response = ""
1403
+ print("Assistant:", end="")
1404
+ stream_kwargs = {
1405
+ "max_tokens": args.max_tokens,
1406
+ "temperature": args.temperature,
1407
+ **kwargs,
1408
+ }
1409
+ if args.prefill_step_size is not None:
1410
+ stream_kwargs["prefill_step_size"] = args.prefill_step_size
1411
+
1412
+ for chunk in stream_generate(
1413
+ model,
1414
+ processor,
1415
+ prompt,
1416
+ args.image,
1417
+ args.audio,
1418
+ **stream_kwargs,
1419
+ ):
1420
+ response += chunk.text
1421
+ print(chunk.text, end="")
1422
+
1423
+ chat.append({"role": "assistant", "content": response})
1424
+ print()
1425
+
1426
+ else:
1427
+ gen_kwargs = {
1428
+ "image": args.image,
1429
+ "audio": args.audio,
1430
+ "temperature": args.temperature,
1431
+ "max_tokens": args.max_tokens,
1432
+ "verbose": args.verbose,
1433
+ "max_kv_size": args.max_kv_size,
1434
+ "kv_bits": args.kv_bits,
1435
+ "kv_group_size": args.kv_group_size,
1436
+ "quantized_kv_start": args.quantized_kv_start,
1437
+ **kwargs,
1438
+ }
1439
+ if args.prefill_step_size is not None:
1440
+ gen_kwargs["prefill_step_size"] = args.prefill_step_size
1441
+
1442
+ result = generate(
1443
+ model,
1444
+ processor,
1445
+ prompt,
1446
+ **gen_kwargs,
1447
+ )
1448
+ if not args.verbose:
1449
+ print(result.text)
1450
+
1451
+
1452
+ if __name__ == "__main__":
1453
+ print(
1454
+ "Calling `python -m mlx_vlm.generate ...` directly is deprecated."
1455
+ " Use `mlx_vlm generate` or `python -m mlx_vlm generate` instead."
1456
+ )
1457
+ main()
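
A minimal usage sketch of the generate.py module added in this release, based only on the signatures visible in the listing above (load from mlx_vlm.utils, apply_chat_template from mlx_vlm.prompt_utils, and generate from mlx_vlm.generate). The image filename is a placeholder; the model path and prompt are the defaults defined at the top of the file, and whether the package re-exports these names from mlx_vlm/__init__.py is not confirmed by this listing.

    from mlx_vlm.generate import generate
    from mlx_vlm.prompt_utils import apply_chat_template
    from mlx_vlm.utils import load

    # Load model and processor; the second argument mirrors main()'s adapter_path=None.
    model, processor = load("mlx-community/nanoLLaVA-1.5-8bit", None)

    # Format the prompt with the model's chat template for a single image.
    prompt = apply_chat_template(processor, model.config, "What are these?", num_images=1)

    # example.jpg is a placeholder image path; temperature and max_tokens match the CLI defaults.
    result = generate(
        model,
        processor,
        prompt,
        image=["example.jpg"],
        max_tokens=256,
        temperature=0.5,
        verbose=True,
    )
    print(result.text)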