fount_vlm_nell_02-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/lora.py ADDED
@@ -0,0 +1,207 @@
+ import argparse
+ import json
+ import logging
+ import os
+
+ import mlx.optimizers as optim
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from .prompt_utils import apply_chat_template
+ from .trainer import Dataset, Trainer, save_adapter
+ from .trainer.utils import apply_lora_layers, find_all_linear_names, get_peft_model
+ from .utils import load, load_image_processor
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def custom_print(*args, **kwargs):
+     tqdm.write(" ".join(map(str, args)), **kwargs)
+
+
+ def main(args):
+     logger.info(f"\033[32mLoading model from {args.model_path}\033[0m")
+     model, processor = load(
+         args.model_path, processor_config={"trust_remote_code": True}
+     )
+     config = model.config.__dict__
+     image_processor = load_image_processor(args.model_path)
+
+     logger.info(f"\033[32mLoading dataset from {args.dataset}\033[0m")
+     dataset = load_dataset(args.dataset, split=args.split)
+
+     if "messages" not in dataset.column_names:
+         raise ValueError("Dataset must have a 'messages' column")
+     if "images" not in dataset.column_names:
+         raise ValueError("Dataset must have an 'images' column")
+
+     if args.apply_chat_template:
+         logger.info(f"\033[32mApplying chat template to the dataset\033[0m")
+
+         def process_data(examples):
+             if config["model_type"] == "pixtral":
+                 conversations = apply_chat_template(
+                     config=config,
+                     processor=processor,
+                     prompt=examples["messages"],
+                     return_messages=True,
+                 )
+                 examples["messages"] = [
+                     json.dumps(item, ensure_ascii=False) for item in conversations
+                 ]
+             else:
+                 examples["messages"] = apply_chat_template(
+                     config=config,
+                     processor=processor,
+                     prompt=examples["messages"],
+                     return_messages=True,
+                 )
+             return examples
+
+         dataset = dataset.map(process_data)
+
+     dataset = Dataset(
+         dataset,
+         config,
+         processor,
+         image_processor=image_processor,
+         image_resize_shape=args.image_resize_shape,
+     )
+
+     adapter_path = args.adapter_path
+     if adapter_path:
+         logger.info(f"\033[32mResuming from adapter path {adapter_path}\033[0m")
+         logger.info(
+             f"\033[32mLora rank, alpha, and dropout will be loaded from adapter_config.json file\033[0m"
+         )
+
+         model = apply_lora_layers(model, adapter_path)
+
+     else:
+         logger.info(f"\033[32mSetting up LoRA\033[0m")
+
+         list_of_modules = find_all_linear_names(model.language_model)
+         model = get_peft_model(
+             model,
+             list_of_modules,
+             rank=args.lora_rank,
+             alpha=args.lora_alpha,
+             dropout=args.lora_dropout,
+         )
+
+     logger.info(f"\033[32mSetting up optimizer\033[0m")
+     optimizer = optim.Adam(learning_rate=args.learning_rate)
+
+     logger.info(f"\033[32mSetting up trainer\033[0m")
+     trainer = Trainer(model, optimizer)
+
+     model.train()
+
+     # Training loop
+     logger.info(f"\033[32mTraining model\033[0m")
+     for epoch in range(args.epochs):
+         if args.steps == 0:
+             args.steps = len(dataset) // args.batch_size
+
+         progress_bar = tqdm(range(args.steps), position=0, leave=True)
+         for i in progress_bar:
+             loss = trainer.train_step(
+                 dataset[i * args.batch_size : (i + 1) * args.batch_size]
+             )
+             # Update progress bar
+             progress_bar.update(1)
+             progress_bar.set_postfix(
+                 {"Epoch": epoch, "Step": i, "Loss": f"{loss.item():.4f}"}
+             )
+
+             if i % args.print_every == 0:
+                 # Log additional information
+                 custom_print(
+                     {
+                         "Epoch": epoch,
+                         "Step": i,
+                         "Loss": f"{loss.item():.4f}",
+                     }
+                 )
+         # Save the interim adapter after each epoch except the last.
+         if args.save_after_epoch and (epoch < (args.epochs - 1)):
+             head, tail = os.path.split(args.output_path)
+             save_adapter(model, head + os.sep + "epoch_" + str(epoch) + "_" + tail)
+
+     # Save the adapter
+     save_adapter(model, args.output_path)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Train NanoLLaVA model")
+     parser.add_argument(
+         "--model-path",
+         type=str,
+         default="mlx-community/Qwen2-VL-2B-Instruct-bf16",
+         help="Path to the pre-trained model",
+     )
+     parser.add_argument(
+         "--dataset", type=str, required=True, help="Path to the dataset"
+     )
+     parser.add_argument(
+         "--split", type=str, default="train", help="Split to use for training"
+     )
+     parser.add_argument(
+         "--image-resize-shape",
+         type=int,
+         nargs=2,
+         default=None,
+         help="Resize images to this shape",
+     )
+     parser.add_argument(
+         "--apply-chat-template",
+         action="store_false",
+         help="Apply chat template to the dataset",
+     )
+     parser.add_argument(
+         "--learning-rate",
+         type=float,
+         default=1e-4,
+         help="Learning rate for the optimizer",
+     )
+     parser.add_argument(
+         "--batch-size", type=int, default=1, help="Batch size for training"
+     )
+     parser.add_argument(
+         "--epochs", type=int, default=1, help="Number of epochs to train"
+     )
+     parser.add_argument(
+         "--steps", type=int, default=0, help="Number of steps per epoch"
+     )
+     parser.add_argument(
+         "--print-every", type=int, default=10, help="Print loss every n steps"
+     )
+     parser.add_argument(
+         "--lora-alpha",
+         type=float,
+         default=0.1,
+         help="LoRA scaling factor (alpha / rank)",
+     )
+     parser.add_argument("--lora-rank", type=int, default=10, help="LoRA rank")
+     parser.add_argument("--lora-dropout", type=float, default=0.1, help="LoRA dropout")
+     parser.add_argument(
+         "--output-path",
+         type=str,
+         default="adapters",
+         help="Path to save the trained adapter",
+     )
+     parser.add_argument(
+         "--adapter-path",
+         type=str,
+         default=None,
+         help="Load path to resume training from a previously saved adapter",
+     )
+     parser.add_argument(
+         "--save-after-epoch",
+         action="store_true",
+         help="Save interim versions of adapter files after each epoch",
+     )
+
+     args = parser.parse_args()
+     main(args)
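The trainer above is driven entirely through argparse, so the usual entry point is "python -m mlx_vlm.lora" with the flags defined at the bottom of the file. The snippet below is a minimal, illustrative sketch of the equivalent programmatic call; it is not part of the wheel, the dataset name is a placeholder, and the model path is simply the script's default. Note that "--apply-chat-template" is declared with action="store_false", so the template is applied by default and the flag turns it off.

# Illustrative sketch only (not shipped in the wheel): drive mlx_vlm.lora
# programmatically instead of via the CLI. The dataset name is a placeholder;
# any Hugging Face dataset with "messages" and "images" columns satisfies the
# checks in main().
from argparse import Namespace

from mlx_vlm.lora import main

args = Namespace(
    model_path="mlx-community/Qwen2-VL-2B-Instruct-bf16",  # script default
    dataset="your-org/your-vlm-sft-dataset",  # placeholder
    split="train",
    image_resize_shape=None,
    apply_chat_template=True,  # CLI flag uses store_false, so True is the default
    learning_rate=1e-4,
    batch_size=1,
    epochs=1,
    steps=0,  # 0 -> len(dataset) // batch_size
    print_every=10,
    lora_alpha=0.1,
    lora_rank=10,
    lora_dropout=0.1,
    output_path="adapters",
    adapter_path=None,
    save_after_epoch=False,
)
main(args)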
mlx_vlm/models/aya_vision/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .aya_vision import LanguageModel, Model, VisionModel
+ from .config import ModelConfig, TextConfig, VisionConfig
mlx_vlm/models/aya_vision/aya_vision.py ADDED
@@ -0,0 +1,188 @@
+ from typing import Optional
+
+ import mlx.core as mx
+ import mlx.nn as nn
+ import numpy as np
+
+ from ..base import InputEmbeddingsFeatures
+ from .config import ModelConfig
+ from .language import LanguageModel
+ from .vision import VisionModel
+
+
+ class AyaVisionMultiModalProjector(nn.Module):
+     def __init__(self, config: ModelConfig):
+         super().__init__()
+         self.config = config
+         self.downsample_factor = config.downsample_factor
+         self.alignment_intermediate_size = getattr(
+             config, "alignment_intermediate_size", config.text_config.hidden_size
+         )
+         if config.model_type == "aya_vision":
+             self.layernorm = nn.LayerNorm(
+                 config.vision_config.hidden_size * (config.downsample_factor**2),
+                 eps=config.adapter_layer_norm_eps,
+             )
+
+         self.linear_1 = nn.Linear(
+             config.vision_config.hidden_size * (config.downsample_factor**2),
+             self.alignment_intermediate_size,
+             bias=True,
+         )
+
+         self.act = nn.SiLU()  # SwiGLU uses SiLU activation
+
+         # For SwiGLU, project down to half size since we split intermediate dim
+         self.linear_2 = nn.Linear(
+             self.alignment_intermediate_size // 2,
+             config.text_config.hidden_size,
+             bias=True,
+         )
+
+     def __call__(self, image_features):
+         image_features = self.pixel_shuffle(image_features)
+         if self.config.model_type == "aya_vision":
+             image_features = self.layernorm(image_features)
+         hidden_states = self.linear_1(image_features)
+
+         # Split along last dimension and apply SwiGLU
+         x, gate = mx.split(hidden_states, 2, axis=-1)
+         hidden_states = self.act(gate) * x
+
+         hidden_states = self.linear_2(hidden_states)
+         return hidden_states
+
+     def pixel_shuffle(self, image_features):  # B, S, D
+         batch_size, seq_length, feature_dim = image_features.shape
+         height = width = int(seq_length**0.5)
+         image_features = image_features.reshape(
+             image_features.shape[0], width, height, -1
+         )
+         channels = image_features.shape[-1]
+         image_features = image_features.reshape(
+             batch_size,
+             width,
+             int(height / self.downsample_factor),
+             int(channels * self.downsample_factor),
+         )
+         image_features = image_features.transpose(0, 2, 1, 3)
+         image_features = image_features.reshape(
+             batch_size,
+             int(height / self.downsample_factor),
+             int(width / self.downsample_factor),
+             -1,
+         )
+         image_features = image_features.transpose(0, 2, 1, 3)
+         return image_features
+
+
+ class Model(nn.Module):
+     def __init__(self, config: ModelConfig):
+         super().__init__()
+         self.config = config
+         self.vision_tower = VisionModel(config.vision_config)
+         self.language_model = LanguageModel(config.text_config)
+         self.multi_modal_projector = AyaVisionMultiModalProjector(config)
+         self.vision_feature_layer = config.vision_feature_layer
+         self.vision_feature_select_strategy = config.vision_feature_select_strategy
+
+     def get_input_embeddings(
+         self,
+         input_ids: Optional[mx.array] = None,
+         pixel_values: Optional[mx.array] = None,
+         **kwargs,
+     ):
+         if pixel_values is None:
+             return InputEmbeddingsFeatures(
+                 inputs_embeds=self.language_model.model.embed_tokens(input_ids)
+             )
+
+         # Get the input embeddings from the language model
+         inputs_embeds = self.language_model.model.embed_tokens(input_ids)
+
+         spatial_shapes = kwargs.get("spatial_shapes", None)
+         # Get the output hidden states from the vision model
+         *_, hidden_states = self.vision_tower(
+             pixel_values.transpose(0, 2, 3, 1),
+             spatial_shapes=spatial_shapes,
+             output_hidden_states=True,
+         )
+
+         # Select the hidden states from the desired layer
+         selected_image_feature = hidden_states[self.vision_feature_layer]
+
+         if self.vision_feature_select_strategy == "default":
+             selected_image_feature = selected_image_feature[:, 1:]
+         elif self.vision_feature_select_strategy == "full":
+             selected_image_feature = selected_image_feature
+         else:
+             raise ValueError(
+                 "Unexpected feature selection strategy: "
+                 f"{self.vision_feature_select_strategy}"
+             )
+
+         # Pass image features through the multi-modal projector
+         image_features = self.multi_modal_projector(selected_image_feature)
+
+         # Insert special image tokens in the input_ids
+         final_inputs_embeds = self._merge_input_ids_with_image_features(
+             image_features, inputs_embeds, input_ids
+         )
+         return InputEmbeddingsFeatures(inputs_embeds=final_inputs_embeds)
+
+     def _merge_input_ids_with_image_features(
+         self, image_features, inputs_embeds, input_ids
+     ):
+         image_token_index = self.config.image_token_index
+
+         # Positions of <image> tokens in input_ids, assuming batch size is 1
+         image_positions = np.where(input_ids[0] == image_token_index)[0].tolist()
+         num_images, _, _, vision_hidden_size = image_features.shape
+
+         reshaped_image_hidden_states = image_features.reshape(-1, vision_hidden_size)
+
+         # cast to the dtype of the input_embeds to support quantized models
+         reshaped_image_hidden_states = reshaped_image_hidden_states.astype(
+             inputs_embeds.dtype
+         )
+         inputs_embeds[:, image_positions, :] = reshaped_image_hidden_states
+         return inputs_embeds
+
+     @property
+     def layers(self):
+         return self.language_model.model.layers
+
+     def __call__(
+         self,
+         input_ids: mx.array,
+         pixel_values: mx.array,
+         mask: mx.array,
+         cache=None,
+         **kwargs,
+     ):
+
+         input_embeddings_features = self.get_input_embeddings(
+             input_ids, pixel_values, **kwargs
+         )
+         logits = self.language_model(
+             input_ids,
+             cache=cache,
+             inputs_embeds=input_embeddings_features.inputs_embeds,
+         )
+         return logits
+
+     def sanitize(self, weights):
+         def transform_key(key):
+             if "model.vision_tower" in key:
+                 key = key.replace("model.vision_tower", "vision_tower")
+             if "model.multi_modal_projector" in key:
+                 key = key.replace(
+                     "model.multi_modal_projector", "multi_modal_projector"
+                 )
+             if "model.language_model" in key:
+                 key = key.replace("model.language_model", "language_model.model")
+             if "lm_head" in key and not key.startswith("language_model"):
+                 key = key.replace("lm_head", "language_model.lm_head")
+             return key
+
+         return {transform_key(k): v for k, v in weights.items()}
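For reference, pixel_shuffle trades spatial resolution for channel depth before projection: with downsample_factor f, a (B, S, D) grid of vision tokens where S = h * w becomes (B, h/f, w/f, D * f * f), which is why linear_1 takes vision hidden_size * downsample_factor**2 as its input width. A standalone shape sketch with illustrative sizes (not part of the wheel) follows; it mirrors the reshape/transpose sequence above.

# Illustrative sketch only (not shipped in the wheel): shape check for the
# pixel-shuffle downsampling above. Sizes are arbitrary examples: 1 image,
# 24x24 = 576 vision tokens, hidden size 1024, downsample_factor 2.
import mlx.core as mx

B, h, w, D, f = 1, 24, 24, 1024, 2
tokens = mx.zeros((B, h * w, D))          # (B, S, D) from the vision tower

x = tokens.reshape(B, w, h, -1)           # (1, 24, 24, 1024)
x = x.reshape(B, w, h // f, D * f)        # (1, 24, 12, 2048)
x = x.transpose(0, 2, 1, 3)               # (1, 12, 24, 2048)
x = x.reshape(B, h // f, w // f, -1)      # (1, 12, 12, 4096)
x = x.transpose(0, 2, 1, 3)               # (1, 12, 12, 4096)

print(x.shape)  # (1, 12, 12, 4096) == (B, h/f, w/f, D * f * f)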
mlx_vlm/models/aya_vision/config.py ADDED
@@ -0,0 +1,52 @@
+ from dataclasses import dataclass
+ from typing import List, Optional
+
+ from ..base import BaseModelConfig
+
+
+ @dataclass
+ class TextConfig(BaseModelConfig):
+     model_type: str
+     hidden_size: int = 8192
+     head_dim: int = 128
+     num_hidden_layers: int = 40
+     intermediate_size: int = 14336
+     num_attention_heads: int = 64
+     num_key_value_heads: int = 8
+     rope_theta: float = 50000.0
+     vocab_size: int = 256000
+     layer_norm_eps: float = 1e-05
+     logit_scale: float = 0.0625
+     attention_bias: bool = False
+     layer_norm_bias: bool = False
+     sliding_window: int = 4096
+     sliding_window_pattern: int = 4
+     max_position_embeddings: int = 4096
+
+
+ @dataclass
+ class VisionConfig(BaseModelConfig):
+     model_type: str
+     hidden_size: int
+     num_attention_heads: int
+     patch_size: int
+     num_hidden_layers: int = 12
+     intermediate_size: int = 3072
+     image_size: int = 224
+     num_channels: int = 3
+     layer_norm_eps: float = 1e-6
+
+
+ @dataclass
+ class ModelConfig(BaseModelConfig):
+     text_config: TextConfig
+     vision_config: VisionConfig
+     model_type: str
+     image_token_index: int = 255036
+     max_splits_per_img: int = 12
+     downsample_factor: int = 2
+     alignment_intermediate_size: int = 28672
+     adapter_layer_norm_eps: float = 1e-06
+     vision_feature_layer: int = -1
+     vision_feature_select_strategy: str = "full"
+     eos_token_id: Optional[List[int]] = None
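Because the config module is a set of plain dataclasses, a configuration can be constructed directly in Python as well as populated from a checkpoint's config.json. The sketch below is illustrative only and not part of the wheel; the vision dimensions and the model_type strings are assumptions, and it presumes BaseModelConfig adds no required fields of its own.

# Illustrative sketch only (not shipped in the wheel): build an aya_vision
# ModelConfig by hand. The model_type strings and vision dimensions are
# assumed example values; real ones come from the checkpoint's config.json.
from mlx_vlm.models.aya_vision.config import ModelConfig, TextConfig, VisionConfig

config = ModelConfig(
    model_type="aya_vision",
    text_config=TextConfig(model_type="cohere2"),  # assumed; defaults give 8192 hidden, 40 layers
    vision_config=VisionConfig(
        model_type="siglip_vision_model",  # assumed example value
        hidden_size=1024,
        num_attention_heads=16,
        patch_size=14,
    ),
)

print(config.image_token_index, config.downsample_factor)  # 255036 2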
mlx_vlm/models/aya_vision/language.py ADDED
@@ -0,0 +1,202 @@
+ from typing import Optional, Tuple
+
+ import mlx.core as mx
+ import mlx.nn as nn
+
+ from ..base import (
+     LanguageModelOutput,
+     create_attention_mask,
+     scaled_dot_product_attention,
+ )
+ from ..cache import KVCache, RotatingKVCache
+ from .config import TextConfig
+
+
+ class Attention(nn.Module):
+     def __init__(self, config: TextConfig, layer_idx: int):
+         super().__init__()
+         self.config = config
+         self.layer_idx = layer_idx
+
+         dim = config.hidden_size
+         self.n_heads = n_heads = config.num_attention_heads
+         self.n_kv_heads = n_kv_heads = config.num_key_value_heads
+         self.head_dim = head_dim = config.head_dim
+         if (head_dim * n_heads) != dim:
+             raise ValueError(
+                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {dim}"
+                 f" and `num_heads`: {n_heads})."
+             )
+         self.scale = head_dim**-0.5
+
+         attention_bias = config.attention_bias
+
+         self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=attention_bias)
+         self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
+         self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
+         self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=attention_bias)
+
+         self.rope = nn.RoPE(head_dim, traditional=True, base=config.rope_theta)
+
+         self.use_sliding_window = (layer_idx + 1) % config.sliding_window_pattern != 0
+
+     def __call__(
+         self,
+         x: mx.array,
+         mask: Optional[mx.array] = None,
+         cache: Optional[Tuple[mx.array, mx.array]] = None,
+     ) -> mx.array:
+         B, L, D = x.shape
+
+         queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+
+         queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
+         keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+         values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
+
+         # Apply RoPE only if sliding window is enabled
+         if self.use_sliding_window:
+             if cache is None:
+                 queries = self.rope(queries)
+                 keys = self.rope(keys)
+             else:
+                 queries = self.rope(queries, offset=cache.offset)
+                 keys = self.rope(keys, offset=cache.offset)
+
+         if cache is not None:
+             keys, values = cache.update_and_fetch(keys, values)
+
+         if self.use_sliding_window and mask is not None and isinstance(mask, mx.array):
+             key_len = keys.shape[-2]
+             if mask.shape[-1] != key_len:
+                 mask = mask[..., -key_len:]
+
+         output = scaled_dot_product_attention(
+             queries, keys, values, cache, scale=self.scale, mask=mask
+         )
+
+         output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
+         return self.o_proj(output)
+
+
+ class MLP(nn.Module):
+     def __init__(self, dim, hidden_dim):
+         super().__init__()
+         self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
+         self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
+         self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
+
+     def __call__(self, x):
+         return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
+
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, config: TextConfig, layer_idx: int):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+         self.n_heads = config.num_attention_heads
+
+         self.self_attn = Attention(config, layer_idx)
+         self.mlp = MLP(config.hidden_size, config.intermediate_size)
+         self.input_layernorm = nn.LayerNorm(
+             config.hidden_size, eps=config.layer_norm_eps, bias=config.layer_norm_bias
+         )
+         self.config = config
+
+     def __call__(
+         self,
+         x: mx.array,
+         mask: Optional[mx.array] = None,
+         cache: Optional[Tuple[mx.array, mx.array]] = None,
+     ) -> mx.array:
+         h = self.input_layernorm(x)
+         attn_h = self.self_attn(h, mask, cache)
+         ff_h = self.mlp(h)
+         return attn_h + ff_h + x
+
+
+ class CohereModel(nn.Module):
+     def __init__(self, config: TextConfig):
+         super().__init__()
+         self.config = config
+         self.vocab_size = config.vocab_size
+         self.num_hidden_layers = config.num_hidden_layers
+         assert self.vocab_size > 0
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.layers = [
+             TransformerBlock(config, layer_idx=i)
+             for i in range(config.num_hidden_layers)
+         ]
+         self.norm = nn.LayerNorm(
+             config.hidden_size, eps=config.layer_norm_eps, bias=config.layer_norm_bias
+         )
+
+     def __call__(
+         self,
+         inputs: mx.array,
+         inputs_embeds: mx.array = None,
+         mask: mx.array = None,
+         cache=None,
+     ):
+         if inputs_embeds is None:
+             h = self.embed_tokens(inputs)
+         else:
+             h = inputs_embeds
+
+         if cache is None:
+             cache = [None] * len(self.layers)
+
+         if mask is None:
+             j = self.config.sliding_window_pattern
+             mask = create_attention_mask(h, cache[j - 1 : j])
+
+         for layer, c in zip(self.layers, cache):
+             h = layer(h, mask, c)
+
+         return self.norm(h)
+
+
+ class LanguageModel(nn.Module):
+     def __init__(self, config: TextConfig):
+         super().__init__()
+         self.model_type = config.model_type
+         self.model = CohereModel(config)
+         self.config = config
+
+     def __call__(
+         self,
+         inputs: mx.array,
+         inputs_embeds: mx.array = None,
+         mask: mx.array = None,
+         cache=None,
+     ):
+         out = self.model(inputs, inputs_embeds, mask, cache)
+         out = self.model.embed_tokens.as_linear(out)
+         out = out * self.model.config.logit_scale
+         return LanguageModelOutput(logits=out)
+
+     def make_cache(self):
+         caches = []
+         for i in range(self.config.num_hidden_layers):
+             if (
+                 i % self.config.sliding_window_pattern
+                 == self.config.sliding_window_pattern - 1
+             ):
+                 caches.append(KVCache())
+             else:
+                 caches.append(
+                     RotatingKVCache(max_size=self.config.sliding_window, keep=0)
+                 )
+         return caches
+
+     @property
+     def layers(self):
+         return self.model.layers
+
+     @property
+     def head_dim(self):
+         return self.model.config.head_dim
+
+     @property
+     def n_kv_heads(self):
+         return self.model.config.num_key_value_heads
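make_cache mirrors the interleaved attention layout visible in Attention.__init__: layers where use_sliding_window is False (every sliding_window_pattern-th layer) skip RoPE and get a full KVCache, while the remaining layers apply RoPE and get a RotatingKVCache bounded by sliding_window. The snippet below is an illustrative sketch of the resulting layout for the TextConfig defaults, with strings standing in for the cache objects; it is not part of the wheel.

# Illustrative sketch only (not shipped in the wheel): cache layout produced by
# LanguageModel.make_cache for the TextConfig defaults
# (40 layers, sliding_window_pattern=4, sliding_window=4096).
num_hidden_layers, pattern = 40, 4

layout = [
    "KVCache (global, no RoPE)" if i % pattern == pattern - 1 else "RotatingKVCache(4096)"
    for i in range(num_hidden_layers)
]

print(layout[:8])
# ['RotatingKVCache(4096)', 'RotatingKVCache(4096)', 'RotatingKVCache(4096)',
#  'KVCache (global, no RoPE)', ...]  -> layers 3, 7, 11, ... attend globally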