fount-vlm-nell-02 0.3.11 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (264)
  1. fount_vlm_nell_02-0.3.11/LICENSE +21 -0
  2. fount_vlm_nell_02-0.3.11/PKG-INFO +418 -0
  3. fount_vlm_nell_02-0.3.11/README.md +372 -0
  4. fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/PKG-INFO +418 -0
  5. fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/SOURCES.txt +262 -0
  6. fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/dependency_links.txt +1 -0
  7. fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/entry_points.txt +5 -0
  8. fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/requires.txt +28 -0
  9. fount_vlm_nell_02-0.3.11/fount_vlm_nell_02.egg-info/top_level.txt +1 -0
  10. fount_vlm_nell_02-0.3.11/mlx_vlm/__init__.py +16 -0
  11. fount_vlm_nell_02-0.3.11/mlx_vlm/__main__.py +24 -0
  12. fount_vlm_nell_02-0.3.11/mlx_vlm/chat.py +234 -0
  13. fount_vlm_nell_02-0.3.11/mlx_vlm/chat_ui.py +508 -0
  14. fount_vlm_nell_02-0.3.11/mlx_vlm/convert.py +284 -0
  15. fount_vlm_nell_02-0.3.11/mlx_vlm/deprecation.py +52 -0
  16. fount_vlm_nell_02-0.3.11/mlx_vlm/evals/__init__.py +0 -0
  17. fount_vlm_nell_02-0.3.11/mlx_vlm/evals/math_vista.py +565 -0
  18. fount_vlm_nell_02-0.3.11/mlx_vlm/evals/mmmu.py +528 -0
  19. fount_vlm_nell_02-0.3.11/mlx_vlm/evals/mmstar.py +343 -0
  20. fount_vlm_nell_02-0.3.11/mlx_vlm/evals/ocrbench.py +453 -0
  21. fount_vlm_nell_02-0.3.11/mlx_vlm/evals/utils.py +37 -0
  22. fount_vlm_nell_02-0.3.11/mlx_vlm/generate.py +1457 -0
  23. fount_vlm_nell_02-0.3.11/mlx_vlm/lora.py +207 -0
  24. fount_vlm_nell_02-0.3.11/mlx_vlm/models/__init__.py +0 -0
  25. fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/__init__.py +2 -0
  26. fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  27. fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/config.py +52 -0
  28. fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/language.py +202 -0
  29. fount_vlm_nell_02-0.3.11/mlx_vlm/models/aya_vision/vision.py +340 -0
  30. fount_vlm_nell_02-0.3.11/mlx_vlm/models/base.py +356 -0
  31. fount_vlm_nell_02-0.3.11/mlx_vlm/models/cache.py +238 -0
  32. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  33. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  34. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  35. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  36. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  37. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  38. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  39. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/__init__.py +2 -0
  40. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/config.py +173 -0
  41. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/conversation.py +264 -0
  42. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  43. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/language.py +547 -0
  44. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  45. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/sam.py +489 -0
  46. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr/vision.py +263 -0
  47. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  48. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/config.py +216 -0
  49. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  50. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  51. fount_vlm_nell_02-0.3.11/mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  52. fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  53. fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  54. fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  55. fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  56. fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  57. fount_vlm_nell_02-0.3.11/mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  58. fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/__init__.py +2 -0
  59. fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/config.py +79 -0
  60. fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  61. fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/language.py +49 -0
  62. fount_vlm_nell_02-0.3.11/mlx_vlm/models/fastvlm/vision.py +692 -0
  63. fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/__init__.py +2 -0
  64. fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/config.py +84 -0
  65. fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/florence2.py +383 -0
  66. fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/language.py +452 -0
  67. fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/processing_florence2.py +30 -0
  68. fount_vlm_nell_02-0.3.11/mlx_vlm/models/florence2/vision.py +552 -0
  69. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/__init__.py +2 -0
  70. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/config.py +52 -0
  71. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/gemma3.py +194 -0
  72. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/language.py +293 -0
  73. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3/vision.py +215 -0
  74. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/__init__.py +2 -0
  75. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/audio.py +1038 -0
  76. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/config.py +130 -0
  77. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  78. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/language.py +631 -0
  79. fount_vlm_nell_02-0.3.11/mlx_vlm/models/gemma3n/vision.py +994 -0
  80. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/__init__.py +3 -0
  81. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/config.py +79 -0
  82. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/glm4v.py +188 -0
  83. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/language.py +574 -0
  84. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/processing.py +220 -0
  85. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v/vision.py +406 -0
  86. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  87. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/config.py +81 -0
  88. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  89. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/language.py +674 -0
  90. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/processing.py +229 -0
  91. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm4v_moe/vision.py +405 -0
  92. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/__init__.py +3 -0
  93. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/config.py +93 -0
  94. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  95. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/language.py +585 -0
  96. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/processing.py +208 -0
  97. fount_vlm_nell_02-0.3.11/mlx_vlm/models/glm_ocr/vision.py +342 -0
  98. fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  99. fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/config.py +136 -0
  100. fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  101. fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/language.py +509 -0
  102. fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  103. fount_vlm_nell_02-0.3.11/mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  104. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/__init__.py +2 -0
  105. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/config.py +65 -0
  106. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/idefics2.py +321 -0
  107. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/language.py +161 -0
  108. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics2/vision.py +244 -0
  109. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/__init__.py +4 -0
  110. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/config.py +54 -0
  111. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/idefics3.py +221 -0
  112. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/language.py +157 -0
  113. fount_vlm_nell_02-0.3.11/mlx_vlm/models/idefics3/vision.py +265 -0
  114. fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/__init__.py +3 -0
  115. fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/config.py +89 -0
  116. fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  117. fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/language.py +187 -0
  118. fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/processor.py +395 -0
  119. fount_vlm_nell_02-0.3.11/mlx_vlm/models/internvl_chat/vision.py +265 -0
  120. fount_vlm_nell_02-0.3.11/mlx_vlm/models/interpolate.py +183 -0
  121. fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/__init__.py +3 -0
  122. fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/config.py +142 -0
  123. fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  124. fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  125. fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/language.py +272 -0
  126. fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  127. fount_vlm_nell_02-0.3.11/mlx_vlm/models/jina_vlm/vision.py +202 -0
  128. fount_vlm_nell_02-0.3.11/mlx_vlm/models/kernels.py +447 -0
  129. fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/__init__.py +4 -0
  130. fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/config.py +84 -0
  131. fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  132. fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/language.py +460 -0
  133. fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  134. fount_vlm_nell_02-0.3.11/mlx_vlm/models/kimi_vl/vision.py +485 -0
  135. fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  136. fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/config.py +94 -0
  137. fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/language.py +49 -0
  138. fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  139. fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  140. fount_vlm_nell_02-0.3.11/mlx_vlm/models/lfm2_vl/vision.py +223 -0
  141. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/__init__.py +2 -0
  142. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/config.py +83 -0
  143. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/language.py +334 -0
  144. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/llama4.py +146 -0
  145. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llama4/vision.py +526 -0
  146. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/__init__.py +2 -0
  147. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/config.py +61 -0
  148. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/language.py +200 -0
  149. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/llava.py +132 -0
  150. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava/vision.py +233 -0
  151. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/__init__.py +2 -0
  152. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/config.py +85 -0
  153. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/language.py +194 -0
  154. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  155. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_bunny/vision.py +278 -0
  156. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/__init__.py +2 -0
  157. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/config.py +60 -0
  158. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/language.py +192 -0
  159. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/llava_next.py +138 -0
  160. fount_vlm_nell_02-0.3.11/mlx_vlm/models/llava_next/vision.py +217 -0
  161. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/__init__.py +2 -0
  162. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/config.py +59 -0
  163. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/language.py +269 -0
  164. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mistral3/mistral3.py +383 -0
  165. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/__init__.py +4 -0
  166. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/config.py +74 -0
  167. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/language.py +377 -0
  168. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/mllama.py +210 -0
  169. fount_vlm_nell_02-0.3.11/mlx_vlm/models/mllama/vision.py +458 -0
  170. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/__init__.py +5 -0
  171. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/config.py +93 -0
  172. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/language.py +208 -0
  173. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/molmo.py +108 -0
  174. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/processing_molmo.py +763 -0
  175. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo/vision.py +408 -0
  176. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/__init__.py +6 -0
  177. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/config.py +137 -0
  178. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/language.py +206 -0
  179. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/molmo2.py +330 -0
  180. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/processing.py +773 -0
  181. fount_vlm_nell_02-0.3.11/mlx_vlm/models/molmo2/vision.py +286 -0
  182. fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/__init__.py +11 -0
  183. fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/config.py +92 -0
  184. fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/image_crops.py +269 -0
  185. fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/language.py +267 -0
  186. fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/moondream2.py +522 -0
  187. fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  188. fount_vlm_nell_02-0.3.11/mlx_vlm/models/moondream2/vision.py +200 -0
  189. fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/__init__.py +4 -0
  190. fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/config.py +108 -0
  191. fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/language.py +191 -0
  192. fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  193. fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/sam.py +543 -0
  194. fount_vlm_nell_02-0.3.11/mlx_vlm/models/multi_modality/vision.py +450 -0
  195. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  196. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/config.py +93 -0
  197. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/language.py +522 -0
  198. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  199. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  200. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  201. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/__init__.py +4 -0
  202. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/config.py +50 -0
  203. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/language.py +253 -0
  204. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/paligemma.py +140 -0
  205. fount_vlm_nell_02-0.3.11/mlx_vlm/models/paligemma/vision.py +218 -0
  206. fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/__init__.py +5 -0
  207. fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/config.py +55 -0
  208. fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/language.py +2 -0
  209. fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  210. fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  211. fount_vlm_nell_02-0.3.11/mlx_vlm/models/phi3_v/vision.py +294 -0
  212. fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/__init__.py +4 -0
  213. fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/config.py +69 -0
  214. fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/language.py +195 -0
  215. fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/pixtral.py +208 -0
  216. fount_vlm_nell_02-0.3.11/mlx_vlm/models/pixtral/vision.py +293 -0
  217. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  218. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  219. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  220. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  221. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  222. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  223. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/config.py +86 -0
  224. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/language.py +539 -0
  225. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  226. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen2_vl/vision.py +308 -0
  227. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  228. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  229. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  230. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  231. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  232. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  233. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  234. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  235. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  236. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  237. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  238. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/config.py +103 -0
  239. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/language.py +596 -0
  240. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  241. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl/vision.py +441 -0
  242. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  243. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  244. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  245. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  246. fount_vlm_nell_02-0.3.11/mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  247. fount_vlm_nell_02-0.3.11/mlx_vlm/models/smolvlm/__init__.py +4 -0
  248. fount_vlm_nell_02-0.3.11/mlx_vlm/models/smolvlm/config.py +59 -0
  249. fount_vlm_nell_02-0.3.11/mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  250. fount_vlm_nell_02-0.3.11/mlx_vlm/prompt_utils.py +565 -0
  251. fount_vlm_nell_02-0.3.11/mlx_vlm/sample_utils.py +39 -0
  252. fount_vlm_nell_02-0.3.11/mlx_vlm/server.py +1107 -0
  253. fount_vlm_nell_02-0.3.11/mlx_vlm/smolvlm_video_generate.py +109 -0
  254. fount_vlm_nell_02-0.3.11/mlx_vlm/tokenizer_utils.py +371 -0
  255. fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/__init__.py +9 -0
  256. fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/lora.py +70 -0
  257. fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/trainer.py +299 -0
  258. fount_vlm_nell_02-0.3.11/mlx_vlm/trainer/utils.py +160 -0
  259. fount_vlm_nell_02-0.3.11/mlx_vlm/utils.py +1339 -0
  260. fount_vlm_nell_02-0.3.11/mlx_vlm/version.py +1 -0
  261. fount_vlm_nell_02-0.3.11/mlx_vlm/video_generate.py +611 -0
  262. fount_vlm_nell_02-0.3.11/pyproject.toml +47 -0
  263. fount_vlm_nell_02-0.3.11/requirements.txt +13 -0
  264. fount_vlm_nell_02-0.3.11/setup.cfg +4 -0
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright © 2025 Prince Canuma
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,418 @@
+ Metadata-Version: 2.4
+ Name: fount-vlm-nell-02
+ Version: 0.3.11
+ Summary: fork of mlx-vlm for fount
+ License: MIT
+ Project-URL: Homepage, https://github.com/Blaizzy/mlx-vlm
+ Project-URL: Repository, https://github.com/Blaizzy/mlx-vlm
+ Project-URL: Issues, https://github.com/Blaizzy/mlx-vlm/issues
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: mlx>=0.30.0
+ Requires-Dist: datasets>=2.19.1
+ Requires-Dist: tqdm>=4.66.2
+ Requires-Dist: transformers>=5.0.0rc3
+ Requires-Dist: mlx-lm>=0.30.5
+ Requires-Dist: Pillow>=10.3.0
+ Requires-Dist: requests>=2.31.0
+ Requires-Dist: fastapi>=0.95.1
+ Requires-Dist: soundfile>=0.13.1
+ Requires-Dist: opencv-python>=4.12.0.88
+ Requires-Dist: numpy
+ Requires-Dist: uvicorn
+ Provides-Extra: ui
+ Requires-Dist: gradio>=5.19.0; extra == "ui"
+ Provides-Extra: torch
+ Requires-Dist: torch; extra == "torch"
+ Requires-Dist: torchvision; extra == "torch"
+ Requires-Dist: einops; extra == "torch"
+ Requires-Dist: blobfile; extra == "torch"
+ Requires-Dist: tiktoken; extra == "torch"
+ Provides-Extra: cuda
+ Requires-Dist: mlx-cuda; extra == "cuda"
+ Provides-Extra: cpu
+ Requires-Dist: mlx-cpu; extra == "cpu"
+ Dynamic: license-file
+
+ [![Upload Python Package](https://github.com/Blaizzy/mlx-vlm/actions/workflows/python-publish.yml/badge.svg)](https://github.com/Blaizzy/mlx-vlm/actions/workflows/python-publish.yml)
+ # MLX-VLM
+
+ MLX-VLM is a package for inference and fine-tuning of Vision Language Models (VLMs) and Omni Models (VLMs with audio and video support) on your Mac using MLX.
+
+ ## Table of Contents
+ - [Installation](#installation)
+ - [Usage](#usage)
+   - [Command Line Interface (CLI)](#command-line-interface-cli)
+   - [Chat UI with Gradio](#chat-ui-with-gradio)
+   - [Python Script](#python-script)
+ - [Multi-Image Chat Support](#multi-image-chat-support)
+ - [Supported Models](#supported-models)
+ - [Usage Examples](#usage-examples)
+ - [Model-Specific Documentation](#model-specific-documentation)
+ - [Fine-tuning](#fine-tuning)
+
+ ## Model-Specific Documentation
+
+ Some models have detailed documentation with prompt formats, examples, and best practices:
+
+ | Model | Documentation |
+ |-------|---------------|
+ | DeepSeek-OCR | [Docs](https://github.com/Blaizzy/mlx-vlm/blob/main/mlx_vlm/models/deepseekocr/README.md) |
+ | DeepSeek-OCR-2 | [Docs](https://github.com/Blaizzy/mlx-vlm/blob/main/mlx_vlm/models/deepseekocr_2/README.md) |
+ | GLM-OCR | [Docs](https://github.com/Blaizzy/mlx-vlm/blob/main/mlx_vlm/models/glm_ocr/README.md) |
+
+ ## Installation
+
+ The easiest way to get started is to install the `mlx-vlm` package using pip:
+
+ ```sh
+ pip install -U mlx-vlm
+ ```
+
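+ The metadata above also declares optional extras (`ui`, `torch`, `cuda`, `cpu`). Below is a minimal sketch using pip's standard extras syntax; the extra names are taken from this package's own metadata, and other distributions of mlx-vlm may not declare the same ones:
+
+ ```sh
+ # Pull in the Gradio dependencies declared under the "ui" extra
+ pip install -U "mlx-vlm[ui]"
+ ```
+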
+ ## Usage
+
+ ### Command Line Interface (CLI)
+
+ Generate output from a model using the CLI:
+
+ ```sh
+ # Text generation
+ mlx_vlm.generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Hello, how are you?"
+
+ # Generation with an image input
+ mlx_vlm.generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --temperature 0.0 --image http://images.cocodataset.org/val2017/000000039769.jpg
+
+ # Generation with an audio input (New)
+ mlx_vlm.generate --model mlx-community/gemma-3n-E2B-it-4bit --max-tokens 100 --prompt "Describe what you hear" --audio /path/to/audio.wav
+
+ # Multi-modal generation (Image + Audio)
+ mlx_vlm.generate --model mlx-community/gemma-3n-E2B-it-4bit --max-tokens 100 --prompt "Describe what you see and hear" --image /path/to/image.jpg --audio /path/to/audio.wav
+ ```
+
+ ### Chat UI with Gradio
+
+ Launch a chat interface using Gradio:
+
+ ```sh
+ mlx_vlm.chat_ui --model mlx-community/Qwen2-VL-2B-Instruct-4bit
+ ```
+
+ ### Python Script
+
+ Here's an example of how to use MLX-VLM in a Python script:
+
+ ```python
+ import mlx.core as mx
+ from mlx_vlm import load, generate
+ from mlx_vlm.prompt_utils import apply_chat_template
+ from mlx_vlm.utils import load_config
+
+ # Load the model
+ model_path = "mlx-community/Qwen2-VL-2B-Instruct-4bit"
+ model, processor = load(model_path)
+ config = load_config(model_path)
+
+ # Prepare input
+ image = ["http://images.cocodataset.org/val2017/000000039769.jpg"]
+ # image = [Image.open("...")] can also be used with PIL.Image.Image objects
+ prompt = "Describe this image."
+
+ # Apply chat template
+ formatted_prompt = apply_chat_template(
+     processor, config, prompt, num_images=len(image)
+ )
+
+ # Generate output
+ output = generate(model, processor, formatted_prompt, image, verbose=False)
+ print(output)
+ ```
+
+ #### Audio Example
+
+ ```python
+ from mlx_vlm import load, generate
+ from mlx_vlm.prompt_utils import apply_chat_template
+ from mlx_vlm.utils import load_config
+
+ # Load model with audio support
+ model_path = "mlx-community/gemma-3n-E2B-it-4bit"
+ model, processor = load(model_path)
+ config = model.config
+
+ # Prepare audio input
+ audio = ["/path/to/audio1.wav", "/path/to/audio2.mp3"]
+ prompt = "Describe what you hear in these audio files."
+
+ # Apply chat template with audio
+ formatted_prompt = apply_chat_template(
+     processor, config, prompt, num_audios=len(audio)
+ )
+
+ # Generate output with audio
+ output = generate(model, processor, formatted_prompt, audio=audio, verbose=False)
+ print(output)
+ ```
+
+ #### Multi-Modal Example (Image + Audio)
+
+ ```python
+ from mlx_vlm import load, generate
+ from mlx_vlm.prompt_utils import apply_chat_template
+ from mlx_vlm.utils import load_config
+
+ # Load multi-modal model
+ model_path = "mlx-community/gemma-3n-E2B-it-4bit"
+ model, processor = load(model_path)
+ config = model.config
+
+ # Prepare inputs
+ image = ["/path/to/image.jpg"]
+ audio = ["/path/to/audio.wav"]
+ prompt = ""
+
+ # Apply chat template
+ formatted_prompt = apply_chat_template(
+     processor, config, prompt,
+     num_images=len(image),
+     num_audios=len(audio)
+ )
+
+ # Generate output
+ output = generate(model, processor, formatted_prompt, image, audio=audio, verbose=False)
+ print(output)
+ ```
+
+ ### Server (FastAPI)
+
+ Start the server:
+ ```sh
+ mlx_vlm.server --port 8080
+
+ # With trust remote code enabled (required for some models)
+ mlx_vlm.server --trust-remote-code
+ ```
+
+ #### Server Options
+
+ - `--host`: Host address (default: `0.0.0.0`)
+ - `--port`: Port number (default: `8080`)
+ - `--trust-remote-code`: Trust remote code when loading models from Hugging Face Hub
+
+ You can also set trust remote code via environment variable:
+ ```sh
+ MLX_TRUST_REMOTE_CODE=true mlx_vlm.server
+ ```
+
+ The server provides multiple endpoints for different use cases and supports dynamic model loading/unloading with caching (one model at a time).
+
+ #### Available Endpoints
+
+ - `/models` - List models available locally
+ - `/chat/completions` - OpenAI-compatible chat-style interaction endpoint with support for images, audio, and text
+ - `/responses` - OpenAI-compatible responses endpoint
+ - `/health` - Check server status
+ - `/unload` - Unload current model from memory
+
+ #### Usage Examples
+
+ ##### List available models
+
+ ```sh
+ curl "http://localhost:8080/models"
+ ```
+
+ ##### Text Input
+
+ ```sh
+ curl -X POST "http://localhost:8080/chat/completions" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "mlx-community/Qwen2-VL-2B-Instruct-4bit",
+     "messages": [
+       {
+         "role": "user",
+         "content": "Hello, how are you?"
+       }
+     ],
+     "stream": true,
+     "max_tokens": 100
+   }'
+ ```
+
+ ##### Image Input
+
+ ```sh
+ curl -X POST "http://localhost:8080/chat/completions" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "mlx-community/Qwen2.5-VL-32B-Instruct-8bit",
+     "messages": [
+       {
+         "role": "system",
+         "content": "You are a helpful assistant."
+       },
+       {
+         "role": "user",
+         "content": [
+           {
+             "type": "text",
+             "text": "This is today's chart for energy demand in California. Can you provide an analysis of the chart and comment on the implications for renewable energy in California?"
+           },
+           {
+             "type": "input_image",
+             "image_url": "/path/to/repo/examples/images/renewables_california.png"
+           }
+         ]
+       }
+     ],
+     "stream": true,
+     "max_tokens": 1000
+   }'
+ ```
+
+ ##### Audio Support (New)
+ ```sh
+ curl -X POST "http://localhost:8080/generate" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "mlx-community/gemma-3n-E2B-it-4bit",
+     "messages": [
+       {
+         "role": "user",
+         "content": [
+           { "type": "text", "text": "Describe what you hear in these audio files" },
+           {"type": "input_audio", "input_audio": "/path/to/audio1.wav"},
+           {"type": "input_audio", "input_audio": "https://example.com/audio2.mp3"}
+         ]
+       }
+     ],
+     "stream": true,
+     "max_tokens": 500
+   }'
+ ```
+
+ ##### Multi-Modal (Image + Audio)
+ ```sh
+ curl -X POST "http://localhost:8080/generate" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "mlx-community/gemma-3n-E2B-it-4bit",
+     "messages": [
+       {
+         "role": "user",
+         "content": [
+           {"type": "input_image", "image_url": "/path/to/image.jpg"},
+           {"type": "input_audio", "input_audio": "/path/to/audio.wav"}
+         ]
+       }
+     ],
+     "max_tokens": 100
+   }'
+ ```
+
+ ##### Responses Endpoint
+ ```sh
+ curl -X POST "http://localhost:8080/responses" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "mlx-community/Qwen2-VL-2B-Instruct-4bit",
+     "messages": [
+       {
+         "role": "user",
+         "content": [
+           {"type": "input_text", "text": "What is in this image?"},
+           {"type": "input_image", "image_url": "/path/to/image.jpg"}
+         ]
+       }
+     ],
+     "max_tokens": 100
+   }'
+ ```
+
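+ ##### Health Check and Unload
+
+ A minimal sketch for the two management endpoints listed under Available Endpoints; it assumes `/health` answers a plain GET and `/unload` a plain POST with no body, which a given release of `mlx_vlm/server.py` may handle differently:
+
+ ```sh
+ # Check that the server is up
+ curl "http://localhost:8080/health"
+
+ # Unload the currently cached model to free memory
+ curl -X POST "http://localhost:8080/unload"
+ ```
+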
+ #### Request Parameters
+
+ - `model`: Model identifier (required)
+ - `messages`: Chat messages for chat/OpenAI endpoints
+ - `max_tokens`: Maximum tokens to generate
+ - `temperature`: Sampling temperature
+ - `top_p`: Top-p sampling parameter
+ - `stream`: Enable streaming responses
+
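+ As a rough illustration of how these parameters fit together, the sketch below sends a non-streaming request to the `/chat/completions` endpoint with the `requests` library; the field names mirror the list above and the model name is just an example:
+
+ ```python
+ import requests
+
+ # Assumes the server from the previous section is running on localhost:8080
+ payload = {
+     "model": "mlx-community/Qwen2-VL-2B-Instruct-4bit",
+     "messages": [{"role": "user", "content": "Write a one-line summary of MLX."}],
+     "max_tokens": 64,
+     "temperature": 0.7,
+     "top_p": 0.9,
+     "stream": False,
+ }
+ response = requests.post("http://localhost:8080/chat/completions", json=payload, timeout=120)
+ response.raise_for_status()
+ print(response.json())
+ ```
+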
+ ## Multi-Image Chat Support
+
+ MLX-VLM supports analyzing multiple images simultaneously with select models. This feature enables more complex visual reasoning tasks and comprehensive analysis across multiple images in a single conversation.
+
+ ### Usage Examples
+
+ #### Python Script
+
+ ```python
+ from mlx_vlm import load, generate
+ from mlx_vlm.prompt_utils import apply_chat_template
+ from mlx_vlm.utils import load_config
+
+ model_path = "mlx-community/Qwen2-VL-2B-Instruct-4bit"
+ model, processor = load(model_path)
+ config = model.config
+
+ images = ["path/to/image1.jpg", "path/to/image2.jpg"]
+ prompt = "Compare these two images."
+
+ formatted_prompt = apply_chat_template(
+     processor, config, prompt, num_images=len(images)
+ )
+
+ output = generate(model, processor, formatted_prompt, images, verbose=False)
+ print(output)
+ ```
+
+ #### Command Line
+
+ ```sh
+ mlx_vlm.generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Compare these images" --image path/to/image1.jpg path/to/image2.jpg
+ ```
+
+ ## Video Understanding
+
+ MLX-VLM also supports video analysis such as captioning, summarization, and more, with select models.
+
+ ### Supported Models
+
+ The following models support video chat:
+
+ 1. Qwen2-VL
+ 2. Qwen2.5-VL
+ 3. Idefics3
+ 4. LLaVA
+
+ With more coming soon.
+
+ ### Usage Examples
+
+ #### Command Line
+ ```sh
+ mlx_vlm.video_generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Describe this video" --video path/to/video.mp4 --max-pixels 224 224 --fps 1.0
+ ```
+
+ These examples demonstrate how to use multiple images and videos with MLX-VLM for more complex visual reasoning tasks.
+
+ ## Fine-tuning
+
+ MLX-VLM supports fine-tuning models with LoRA and QLoRA.
+
+ ### LoRA & QLoRA
+
+ To learn more about LoRA, please refer to the [LoRA.md](./mlx_vlm/LORA.MD) file.
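+
+ The package also ships a LoRA training module at `mlx_vlm/lora.py` (see the file list above). Its flags vary between releases, so the invocation below is a hypothetical sketch; run the module with `--help` to see what your installed version actually accepts:
+
+ ```sh
+ # Inspect the training options exposed by this release
+ python -m mlx_vlm.lora --help
+
+ # Hypothetical example: the flag names here are assumptions, not the documented interface
+ python -m mlx_vlm.lora --model-path mlx-community/Qwen2-VL-2B-Instruct-4bit --dataset /path/to/dataset
+ ```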