fount-vlm-nell-02 0.3.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fount_vlm_nell_02-0.3.11/LICENSE +21 -0
- fount_vlm_nell_02-0.3.11/PKG-INFO +418 -0
- fount_vlm_nell_02-0.3.11/README.md +372 -0
- fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/PKG-INFO +418 -0
- fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/SOURCES.txt +262 -0
- fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/dependency_links.txt +1 -0
- fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/entry_points.txt +5 -0
- fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/requires.txt +28 -0
- fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/top_level.txt +1 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/__init__.py +16 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/__main__.py +24 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/chat.py +234 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/chat_ui.py +508 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/convert.py +284 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/deprecation.py +52 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/evals/__init__.py +0 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/evals/math_vista.py +565 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/evals/mmmu.py +528 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/evals/mmstar.py +343 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/evals/ocrbench.py +453 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/evals/utils.py +37 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/generate.py +1457 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/lora.py +207 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/__init__.py +0 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/aya_vision.py +188 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/config.py +52 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/language.py +202 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/vision.py +340 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/base.py +356 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/cache.py +238 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/config.py +173 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/conversation.py +264 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/language.py +547 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/sam.py +489 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/vision.py +263 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/config.py +216 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/vision.py +439 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/config.py +79 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/fastvlm.py +198 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/language.py +49 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/vision.py +692 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/config.py +84 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/florence2.py +383 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/language.py +452 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/processing_florence2.py +30 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/vision.py +552 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/config.py +52 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/gemma3.py +194 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/language.py +293 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/vision.py +215 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/audio.py +1038 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/config.py +130 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/gemma3n.py +322 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/language.py +631 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/vision.py +994 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/__init__.py +3 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/config.py +79 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/glm4v.py +188 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/language.py +574 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/processing.py +220 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/vision.py +406 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/__init__.py +3 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/config.py +81 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/language.py +674 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/processing.py +229 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/vision.py +405 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/__init__.py +3 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/config.py +93 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/language.py +585 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/processing.py +208 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/vision.py +342 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/config.py +136 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/language.py +509 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/vision.py +322 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/config.py +65 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/idefics2.py +321 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/language.py +161 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/vision.py +244 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/__init__.py +4 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/config.py +54 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/idefics3.py +221 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/language.py +157 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/vision.py +265 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/__init__.py +3 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/config.py +89 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/language.py +187 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/processor.py +395 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/vision.py +265 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/interpolate.py +183 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/__init__.py +3 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/config.py +142 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/image_processor.py +430 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/language.py +272 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/vision.py +202 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/kernels.py +447 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/__init__.py +4 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/config.py +84 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/language.py +460 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/vision.py +485 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/config.py +94 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/language.py +49 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/vision.py +223 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/config.py +83 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/language.py +334 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/llama4.py +146 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/vision.py +526 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/config.py +61 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/language.py +200 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/llava.py +132 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/vision.py +233 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/config.py +85 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/language.py +194 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/vision.py +278 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/config.py +60 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/language.py +192 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/llava_next.py +138 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/vision.py +217 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/config.py +59 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/language.py +269 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/mistral3.py +383 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/__init__.py +4 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/config.py +74 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/language.py +377 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/mllama.py +210 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/vision.py +458 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/__init__.py +5 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/config.py +93 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/language.py +208 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/molmo.py +108 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/processing_molmo.py +763 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/vision.py +408 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/__init__.py +6 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/config.py +137 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/language.py +206 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/molmo2.py +330 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/processing.py +773 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/vision.py +286 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/__init__.py +11 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/config.py +92 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/image_crops.py +269 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/language.py +267 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/moondream2.py +522 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/processing_moondream.py +144 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/vision.py +200 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/__init__.py +4 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/config.py +108 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/language.py +191 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/multi_modality.py +338 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/sam.py +543 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/vision.py +450 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/config.py +93 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/language.py +522 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/vision.py +358 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/__init__.py +4 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/config.py +50 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/language.py +253 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/paligemma.py +140 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/vision.py +218 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/__init__.py +5 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/config.py +55 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/language.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/phi3_v.py +239 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/vision.py +294 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/__init__.py +4 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/config.py +69 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/language.py +195 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/pixtral.py +208 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/vision.py +293 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/config.py +90 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/language.py +541 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/config.py +86 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/language.py +539 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/vision.py +308 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/config.py +103 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/language.py +596 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/vision.py +441 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/smolvlm/__init__.py +4 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/smolvlm/config.py +59 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/models/smolvlm/smolvlm.py +60 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/prompt_utils.py +565 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/sample_utils.py +39 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/server.py +1107 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/smolvlm_video_generate.py +109 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/tokenizer_utils.py +371 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/__init__.py +9 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/lora.py +70 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/trainer.py +299 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/utils.py +160 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/utils.py +1339 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/version.py +1 -0
- fount_vlm_nell_02-0.3.11/mlx_vlm/video_generate.py +611 -0
- fount_vlm_nell_02-0.3.11/pyproject.toml +47 -0
- fount_vlm_nell_02-0.3.11/requirements.txt +13 -0
- fount_vlm_nell_02-0.3.11/setup.cfg +4 -0
@@ -0,0 +1,21 @@
MIT License

Copyright © 2025 Prince Canuma

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,418 @@
Metadata-Version: 2.4
Name: fount-vlm-nell-02
Version: 0.3.11
Summary: fork of mlx-vlm for fount
License: MIT
Project-URL: Homepage, https://github.com/Blaizzy/mlx-vlm
Project-URL: Repository, https://github.com/Blaizzy/mlx-vlm
Project-URL: Issues, https://github.com/Blaizzy/mlx-vlm/issues
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: mlx>=0.30.0
Requires-Dist: datasets>=2.19.1
Requires-Dist: tqdm>=4.66.2
Requires-Dist: transformers>=5.0.0rc3
Requires-Dist: mlx-lm>=0.30.5
Requires-Dist: Pillow>=10.3.0
Requires-Dist: requests>=2.31.0
Requires-Dist: fastapi>=0.95.1
Requires-Dist: soundfile>=0.13.1
Requires-Dist: opencv-python>=4.12.0.88
Requires-Dist: numpy
Requires-Dist: uvicorn
Provides-Extra: ui
Requires-Dist: gradio>=5.19.0; extra == "ui"
Provides-Extra: torch
Requires-Dist: torch; extra == "torch"
Requires-Dist: torchvision; extra == "torch"
Requires-Dist: einops; extra == "torch"
Requires-Dist: blobfile; extra == "torch"
Requires-Dist: tiktoken; extra == "torch"
Provides-Extra: cuda
Requires-Dist: mlx-cuda; extra == "cuda"
Provides-Extra: cpu
Requires-Dist: mlx-cpu; extra == "cpu"
Dynamic: license-file
[](https://github.com/Blaizzy/mlx-vlm/actions/workflows/python-publish.yml)
# MLX-VLM

MLX-VLM is a package for inference and fine-tuning of Vision Language Models (VLMs) and Omni Models (VLMs with audio and video support) on your Mac using MLX.

## Table of Contents
- [Installation](#installation)
- [Usage](#usage)
- [Command Line Interface (CLI)](#command-line-interface-cli)
- [Chat UI with Gradio](#chat-ui-with-gradio)
- [Python Script](#python-script)
- [Multi-Image Chat Support](#multi-image-chat-support)
- [Supported Models](#supported-models)
- [Usage Examples](#usage-examples)
- [Model-Specific Documentation](#model-specific-documentation)
- [Fine-tuning](#fine-tuning)

## Model-Specific Documentation

Some models have detailed documentation with prompt formats, examples, and best practices:

| Model | Documentation |
|-------|---------------|
| DeepSeek-OCR | [Docs](https://github.com/Blaizzy/mlx-vlm/blob/main/mlx_vlm/models/deepseekocr/README.md) |
| DeepSeek-OCR-2 | [Docs](https://github.com/Blaizzy/mlx-vlm/blob/main/mlx_vlm/models/deepseekocr_2/README.md) |
| GLM-OCR | [Docs](https://github.com/Blaizzy/mlx-vlm/blob/main/mlx_vlm/models/glm_ocr/README.md) |

## Installation

The easiest way to get started is to install the `mlx-vlm` package using pip:

```sh
pip install -U mlx-vlm
```
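
The package metadata above also declares optional extras (`ui`, `torch`, `cuda`, `cpu`). As a small sketch grounded in those declarations, the Gradio dependency needed for the chat UI can be pulled in at install time via the `ui` extra:

```sh
# Install together with the Gradio chat UI dependency (the "ui" extra declared in the metadata above)
pip install -U "mlx-vlm[ui]"
```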

## Usage

### Command Line Interface (CLI)

Generate output from a model using the CLI:

```sh
# Text generation
mlx_vlm.generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Hello, how are you?"

# Image generation
mlx_vlm.generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --temperature 0.0 --image http://images.cocodataset.org/val2017/000000039769.jpg

# Audio generation (New)
mlx_vlm.generate --model mlx-community/gemma-3n-E2B-it-4bit --max-tokens 100 --prompt "Describe what you hear" --audio /path/to/audio.wav

# Multi-modal generation (Image + Audio)
mlx_vlm.generate --model mlx-community/gemma-3n-E2B-it-4bit --max-tokens 100 --prompt "Describe what you see and hear" --image /path/to/image.jpg --audio /path/to/audio.wav
```

### Chat UI with Gradio

Launch a chat interface using Gradio:

```sh
mlx_vlm.chat_ui --model mlx-community/Qwen2-VL-2B-Instruct-4bit
```

### Python Script

Here's an example of how to use MLX-VLM in a Python script:

```python
import mlx.core as mx
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

# Load the model
model_path = "mlx-community/Qwen2-VL-2B-Instruct-4bit"
model, processor = load(model_path)
config = load_config(model_path)

# Prepare input
image = ["http://images.cocodataset.org/val2017/000000039769.jpg"]
# image = [Image.open("...")] can also be used with PIL.Image.Image objects
prompt = "Describe this image."

# Apply chat template
formatted_prompt = apply_chat_template(
    processor, config, prompt, num_images=len(image)
)

# Generate output
output = generate(model, processor, formatted_prompt, image, verbose=False)
print(output)
```

#### Audio Example

```python
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

# Load model with audio support
model_path = "mlx-community/gemma-3n-E2B-it-4bit"
model, processor = load(model_path)
config = model.config

# Prepare audio input
audio = ["/path/to/audio1.wav", "/path/to/audio2.mp3"]
prompt = "Describe what you hear in these audio files."

# Apply chat template with audio
formatted_prompt = apply_chat_template(
    processor, config, prompt, num_audios=len(audio)
)

# Generate output with audio
output = generate(model, processor, formatted_prompt, audio=audio, verbose=False)
print(output)
```

#### Multi-Modal Example (Image + Audio)

```python
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

# Load multi-modal model
model_path = "mlx-community/gemma-3n-E2B-it-4bit"
model, processor = load(model_path)
config = model.config

# Prepare inputs
image = ["/path/to/image.jpg"]
audio = ["/path/to/audio.wav"]
prompt = ""

# Apply chat template
formatted_prompt = apply_chat_template(
    processor, config, prompt,
    num_images=len(image),
    num_audios=len(audio)
)

# Generate output
output = generate(model, processor, formatted_prompt, image, audio=audio, verbose=False)
print(output)
```

### Server (FastAPI)

Start the server:
```sh
mlx_vlm.server --port 8080

# With trust remote code enabled (required for some models)
mlx_vlm.server --trust-remote-code
```

#### Server Options

- `--host`: Host address (default: `0.0.0.0`)
- `--port`: Port number (default: `8080`)
- `--trust-remote-code`: Trust remote code when loading models from Hugging Face Hub

You can also set trust remote code via environment variable:
```sh
MLX_TRUST_REMOTE_CODE=true mlx_vlm.server
```

The server provides multiple endpoints for different use cases and supports dynamic model loading/unloading with caching (one model at a time).

#### Available Endpoints

- `/models` - List models available locally
- `/chat/completions` - OpenAI-compatible chat-style interaction endpoint with support for images, audio, and text
- `/responses` - OpenAI-compatible responses endpoint
- `/health` - Check server status
- `/unload` - Unload current model from memory
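
The usage examples below exercise the chat, image, audio, and responses endpoints; the health-check and unload endpoints can be hit directly. A minimal sketch, assuming `/health` responds to a plain GET and `/unload` to a POST with no body (the HTTP methods are assumptions, not stated in this README):

```sh
# Check server status (assumed GET)
curl "http://localhost:8080/health"

# Unload the currently cached model to free memory (assumed POST, no body)
curl -X POST "http://localhost:8080/unload"
```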

#### Usage Examples

##### List available models

```sh
curl "http://localhost:8080/models"
```

##### Text Input

```sh
curl -X POST "http://localhost:8080/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mlx-community/Qwen2-VL-2B-Instruct-4bit",
    "messages": [
      {
        "role": "user",
        "content": "Hello, how are you?"
      }
    ],
    "stream": true,
    "max_tokens": 100
  }'
```
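
Since `/chat/completions` follows the OpenAI chat format, the same request can also be made from Python. Below is a minimal sketch using the official `openai` client pointed at the local server; the `api_key` value is a placeholder and it is assumed the server does not validate it:

```python
from openai import OpenAI

# Point the OpenAI client at the local MLX-VLM server.
client = OpenAI(base_url="http://localhost:8080", api_key="not-needed")

# Stream a text-only chat completion, mirroring the curl example above.
stream = client.chat.completions.create(
    model="mlx-community/Qwen2-VL-2B-Instruct-4bit",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    max_tokens=100,
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```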

##### Image Input

```sh
curl -X POST "http://localhost:8080/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mlx-community/Qwen2.5-VL-32B-Instruct-8bit",
    "messages": [
      {
        "role": "system",
        "content": "You are a helpful assistant."
      },
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "This is the latest chart for energy demand in California. Can you provide an analysis of the chart and comment on the implications for renewable energy in California?"
          },
          {
            "type": "input_image",
            "image_url": "/path/to/repo/examples/images/renewables_california.png"
          }
        ]
      }
    ],
    "stream": true,
    "max_tokens": 1000
  }'
```

##### Audio Support (New)
```sh
curl -X POST "http://localhost:8080/generate" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mlx-community/gemma-3n-E2B-it-4bit",
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "text", "text": "Describe what you hear in these audio files"},
          {"type": "input_audio", "input_audio": "/path/to/audio1.wav"},
          {"type": "input_audio", "input_audio": "https://example.com/audio2.mp3"}
        ]
      }
    ],
    "stream": true,
    "max_tokens": 500
  }'
```

##### Multi-Modal (Image + Audio)
```sh
curl -X POST "http://localhost:8080/generate" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mlx-community/gemma-3n-E2B-it-4bit",
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "input_image", "image_url": "/path/to/image.jpg"},
          {"type": "input_audio", "input_audio": "/path/to/audio.wav"}
        ]
      }
    ],
    "max_tokens": 100
  }'
```

##### Responses Endpoint
```sh
curl -X POST "http://localhost:8080/responses" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mlx-community/Qwen2-VL-2B-Instruct-4bit",
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "input_text", "text": "What is in this image?"},
          {"type": "input_image", "image_url": "/path/to/image.jpg"}
        ]
      }
    ],
    "max_tokens": 100
  }'
```

#### Request Parameters

- `model`: Model identifier (required)
- `messages`: Chat messages for chat/OpenAI endpoints
- `max_tokens`: Maximum tokens to generate
- `temperature`: Sampling temperature
- `top_p`: Top-p sampling parameter
- `stream`: Enable streaming responses
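
These parameters map directly onto the JSON body of a request. Here is a minimal non-streaming sketch with `requests`, assuming the response body follows the usual OpenAI chat-completion shape (`choices[0].message.content`):

```python
import requests

payload = {
    "model": "mlx-community/Qwen2-VL-2B-Instruct-4bit",
    "messages": [{"role": "user", "content": "Write a haiku about the ocean."}],
    "max_tokens": 100,
    "temperature": 0.7,  # sampling temperature
    "top_p": 0.9,        # nucleus sampling cutoff
    "stream": False,     # single JSON response instead of a stream
}

response = requests.post("http://localhost:8080/chat/completions", json=payload, timeout=300)
response.raise_for_status()

# Assumes an OpenAI-style response body.
print(response.json()["choices"][0]["message"]["content"])
```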

## Multi-Image Chat Support

MLX-VLM supports analyzing multiple images simultaneously with select models. This feature enables more complex visual reasoning tasks and comprehensive analysis across multiple images in a single conversation.

### Usage Examples

#### Python Script

```python
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

model_path = "mlx-community/Qwen2-VL-2B-Instruct-4bit"
model, processor = load(model_path)
config = model.config

images = ["path/to/image1.jpg", "path/to/image2.jpg"]
prompt = "Compare these two images."

formatted_prompt = apply_chat_template(
    processor, config, prompt, num_images=len(images)
)

output = generate(model, processor, formatted_prompt, images, verbose=False)
print(output)
```

#### Command Line

```sh
mlx_vlm.generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Compare these images" --image path/to/image1.jpg path/to/image2.jpg
```

## Video Understanding

MLX-VLM also supports video analysis such as captioning, summarization, and more, with select models.

### Supported Models

The following models support video chat:

1. Qwen2-VL
2. Qwen2.5-VL
3. Idefics3
4. LLaVA

With more coming soon.

### Usage Examples

#### Command Line
```sh
mlx_vlm.video_generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Describe this video" --video path/to/video.mp4 --max-pixels 224 224 --fps 1.0
```
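
This README does not include a dedicated Python snippet for video; as a minimal sketch, the same command can be driven from Python by shelling out to the CLI shown above (the flags are copied verbatim from the example and the paths are placeholders):

```python
import subprocess

# Invoke the video CLI shown above with the same flags as the example command.
subprocess.run(
    [
        "mlx_vlm.video_generate",
        "--model", "mlx-community/Qwen2-VL-2B-Instruct-4bit",
        "--max-tokens", "100",
        "--prompt", "Describe this video",
        "--video", "path/to/video.mp4",
        "--max-pixels", "224", "224",
        "--fps", "1.0",
    ],
    check=True,
)
```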

These examples demonstrate how to use multiple images and video with MLX-VLM for more complex visual reasoning tasks.

# Fine-tuning

MLX-VLM supports fine-tuning models with LoRA and QLoRA.

## LoRA & QLoRA

To learn more about LoRA, please refer to the [LoRA.md](./mlx_vlm/LORA.MD) file.