nexaai 1.0.19rc5__cp310-cp310-macosx_14_0_universal2.whl → 1.0.19rc7__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of nexaai has been flagged as a potentially problematic release.

Files changed (221)
  1. nexaai/_stub.cpython-310-darwin.so +0 -0
  2. nexaai/_version.py +1 -1
  3. nexaai/binds/libnexa_bridge.dylib +0 -0
  4. nexaai/binds/nexa_llama_cpp/libggml-base.dylib +0 -0
  5. nexaai/binds/nexa_llama_cpp/libggml-cpu.so +0 -0
  6. nexaai/binds/nexa_llama_cpp/libggml-metal.so +0 -0
  7. nexaai/binds/nexa_llama_cpp/libggml.dylib +0 -0
  8. nexaai/binds/nexa_llama_cpp/libllama.dylib +0 -0
  9. nexaai/binds/nexa_llama_cpp/libmtmd.dylib +0 -0
  10. nexaai/binds/nexa_llama_cpp/libnexa_plugin.dylib +0 -0
  11. nexaai/binds/nexa_mlx/libnexa_plugin.dylib +0 -0
  12. nexaai/binds/nexa_mlx/py-lib/asr/__init__.py +12 -0
  13. nexaai/binds/nexa_mlx/py-lib/asr/interface.py +122 -0
  14. nexaai/binds/nexa_mlx/py-lib/common/__init__.py +0 -0
  15. nexaai/binds/nexa_mlx/py-lib/common/utils.py +25 -0
  16. nexaai/binds/nexa_mlx/py-lib/cv/__init__.py +0 -0
  17. nexaai/binds/nexa_mlx/py-lib/cv/generate.py +195 -0
  18. nexaai/binds/nexa_mlx/py-lib/cv/interface.py +151 -0
  19. nexaai/binds/nexa_mlx/py-lib/cv/main.py +81 -0
  20. nexaai/binds/nexa_mlx/py-lib/cv/modeling/pp_ocr_v4.py +1736 -0
  21. nexaai/binds/nexa_mlx/py-lib/embedding/__init__.py +0 -0
  22. nexaai/binds/nexa_mlx/py-lib/embedding/generate.py +333 -0
  23. nexaai/binds/nexa_mlx/py-lib/embedding/interface.py +617 -0
  24. nexaai/binds/nexa_mlx/py-lib/embedding/main.py +173 -0
  25. nexaai/binds/nexa_mlx/py-lib/embedding/modeling/__init__.py +0 -0
  26. nexaai/binds/nexa_mlx/py-lib/embedding/modeling/nexa_jina_v2.py +399 -0
  27. nexaai/binds/nexa_mlx/py-lib/image_gen/__init__.py +1 -0
  28. nexaai/binds/nexa_mlx/py-lib/image_gen/generate_sd.py +244 -0
  29. nexaai/binds/nexa_mlx/py-lib/image_gen/interface.py +82 -0
  30. nexaai/binds/nexa_mlx/py-lib/image_gen/main.py +281 -0
  31. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/__init__.py +306 -0
  32. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/clip.py +116 -0
  33. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/config.py +65 -0
  34. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/model_io.py +386 -0
  35. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/sampler.py +105 -0
  36. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/tokenizer.py +100 -0
  37. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/unet.py +460 -0
  38. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/vae.py +274 -0
  39. nexaai/binds/nexa_mlx/py-lib/llm/__init__.py +0 -0
  40. nexaai/binds/nexa_mlx/py-lib/llm/generate.py +149 -0
  41. nexaai/binds/nexa_mlx/py-lib/llm/interface.py +764 -0
  42. nexaai/binds/nexa_mlx/py-lib/llm/main.py +68 -0
  43. nexaai/binds/nexa_mlx/py-lib/rerank/__init__.py +0 -0
  44. nexaai/binds/nexa_mlx/py-lib/rerank/generate.py +174 -0
  45. nexaai/binds/nexa_mlx/py-lib/rerank/interface.py +287 -0
  46. nexaai/binds/nexa_mlx/py-lib/rerank/main.py +127 -0
  47. nexaai/binds/nexa_mlx/py-lib/rerank/modeling/__init__.py +0 -0
  48. nexaai/binds/nexa_mlx/py-lib/rerank/modeling/nexa_jina_rerank.py +330 -0
  49. nexaai/binds/nexa_mlx/py-lib/sd/__init__.py +1 -0
  50. nexaai/binds/nexa_mlx/py-lib/sd/interface.py +362 -0
  51. nexaai/binds/nexa_mlx/py-lib/sd/main.py +286 -0
  52. nexaai/binds/nexa_mlx/py-lib/sd/modeling/__init__.py +306 -0
  53. nexaai/binds/nexa_mlx/py-lib/sd/modeling/clip.py +116 -0
  54. nexaai/binds/nexa_mlx/py-lib/sd/modeling/config.py +65 -0
  55. nexaai/binds/nexa_mlx/py-lib/sd/modeling/model_io.py +385 -0
  56. nexaai/binds/nexa_mlx/py-lib/sd/modeling/sampler.py +105 -0
  57. nexaai/binds/nexa_mlx/py-lib/sd/modeling/tokenizer.py +100 -0
  58. nexaai/binds/nexa_mlx/py-lib/sd/modeling/unet.py +460 -0
  59. nexaai/binds/nexa_mlx/py-lib/sd/modeling/vae.py +274 -0
  60. nexaai/binds/nexa_mlx/py-lib/tts/__init__.py +12 -0
  61. nexaai/binds/nexa_mlx/py-lib/tts/interface.py +276 -0
  62. nexaai/binds/nexa_mlx/py-lib/vlm/__init__.py +3 -0
  63. nexaai/binds/nexa_mlx/py-lib/vlm/generate.py +572 -0
  64. nexaai/binds/nexa_mlx/py-lib/vlm/generate_qwen3_vl.py +294 -0
  65. nexaai/binds/nexa_mlx/py-lib/vlm/generate_qwen3_vl_moe.py +276 -0
  66. nexaai/binds/nexa_mlx/py-lib/vlm/interface.py +504 -0
  67. nexaai/binds/nexa_mlx/py-lib/vlm/main.py +320 -0
  68. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/__init__.py +0 -0
  69. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/convert.py +68 -0
  70. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/__init__.py +0 -0
  71. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/__init__.py +8 -0
  72. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  73. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  74. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/language.py +233 -0
  75. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/vision.py +503 -0
  76. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/base.py +202 -0
  77. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/cache.py +230 -0
  78. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  79. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  80. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  81. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  82. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  83. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  84. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/__init__.py +8 -0
  85. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/florence2.py +366 -0
  86. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/language.py +488 -0
  87. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/vision.py +591 -0
  88. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/__init__.py +8 -0
  89. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/gemma3.py +213 -0
  90. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/language.py +315 -0
  91. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/vision.py +238 -0
  92. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/__init__.py +2 -0
  93. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/audio.py +1038 -0
  94. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/config.py +139 -0
  95. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  96. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/language.py +629 -0
  97. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/vision.py +1022 -0
  98. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/__init__.py +9 -0
  99. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/idefics2.py +294 -0
  100. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/language.py +191 -0
  101. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/vision.py +267 -0
  102. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/__init__.py +8 -0
  103. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/idefics3.py +175 -0
  104. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/language.py +192 -0
  105. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/vision.py +233 -0
  106. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  107. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  108. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/language.py +220 -0
  109. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/processor.py +393 -0
  110. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/vision.py +293 -0
  111. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kernels.py +307 -0
  112. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  113. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  114. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/language.py +509 -0
  115. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/vision.py +522 -0
  116. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/__init__.py +8 -0
  117. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/language.py +386 -0
  118. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/llama4.py +138 -0
  119. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/vision.py +560 -0
  120. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/__init__.py +8 -0
  121. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/language.py +240 -0
  122. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/llava.py +153 -0
  123. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/vision.py +259 -0
  124. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  125. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/language.py +236 -0
  126. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  127. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/vision.py +303 -0
  128. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/__init__.py +8 -0
  129. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/language.py +230 -0
  130. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/llava_next.py +160 -0
  131. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/vision.py +243 -0
  132. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mistral3/__init__.py +8 -0
  133. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mistral3/mistral3.py +283 -0
  134. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/__init__.py +8 -0
  135. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/language.py +416 -0
  136. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/mllama.py +172 -0
  137. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/vision.py +499 -0
  138. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/__init__.py +8 -0
  139. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/language.py +243 -0
  140. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/molmo.py +133 -0
  141. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/vision.py +465 -0
  142. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/__init__.py +10 -0
  143. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/language.py +230 -0
  144. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  145. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/sam.py +557 -0
  146. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/vision.py +526 -0
  147. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/__init__.py +8 -0
  148. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/language.py +282 -0
  149. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/paligemma.py +160 -0
  150. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/vision.py +242 -0
  151. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/__init__.py +8 -0
  152. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/language.py +21 -0
  153. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  154. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  155. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/vision.py +324 -0
  156. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/__init__.py +8 -0
  157. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/language.py +229 -0
  158. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/pixtral.py +161 -0
  159. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/vision.py +320 -0
  160. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  161. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  162. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  163. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  164. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  165. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  166. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/config.py +104 -0
  167. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/language.py +490 -0
  168. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  169. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  170. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
  171. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
  172. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
  173. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
  174. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
  175. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
  176. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
  177. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/processor.py +476 -0
  178. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/qwen3vl.py +1223 -0
  179. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  180. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  181. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  182. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  183. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  184. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  185. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  186. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  187. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1309 -0
  188. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  189. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/smolvlm/__init__.py +8 -0
  190. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  191. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  192. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/processing_qwen2_vl.py +215 -0
  193. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/prompt_utils.py +474 -0
  194. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/sample_utils.py +39 -0
  195. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/tokenizer_utils.py +344 -0
  196. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/__init__.py +9 -0
  197. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/lora.py +70 -0
  198. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/trainer.py +296 -0
  199. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/utils.py +160 -0
  200. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/utils.py +928 -0
  201. nexaai/binds/nexa_nexaml/libggml-base.dylib +0 -0
  202. nexaai/binds/nexa_nexaml/libggml-cpu.so +0 -0
  203. nexaai/binds/nexa_nexaml/libggml-metal.so +0 -0
  204. nexaai/binds/nexa_nexaml/libggml.dylib +0 -0
  205. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +276 -0
  206. nexaai/mlx_backend/vlm/interface.py +21 -4
  207. nexaai/mlx_backend/vlm/main.py +6 -2
  208. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  209. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  210. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  211. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  212. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  213. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  214. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  215. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  216. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1309 -0
  217. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  218. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/METADATA +1 -1
  219. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/RECORD +221 -21
  220. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/WHEEL +0 -0
  221. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,276 @@
+ import argparse
+ import json
+ import sys
+ import os
+ import mlx.core as mx
+ import mlx.nn as nn
+ import time
+ from PIL import Image
+ import requests
+ import numpy as np
+ from pathlib import Path
+ from huggingface_hub import snapshot_download
+
+ # Add current directory to path for imports
+ curr_dir = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(curr_dir)
+ sys.path.append(os.path.dirname(curr_dir))
+
+ # Add the qwen3vl model directory to path
+ qwen3vl_dir = os.path.join(curr_dir, "modeling", "models", "qwen3vl_moe")
+ sys.path.append(qwen3vl_dir)
+
+ # Import required modules for quantized loading
+ from transformers import AutoTokenizer
+
+ # Try relative imports first, fall back to the sys.path approach for Nuitka compatibility
+ try:
+     from .modeling.models.qwen3_vl_moe.llm_common.generate import nexa_generate_step
+     from .modeling.models.qwen3_vl_moe.llm_common.cache import make_prompt_cache
+     from .modeling.models.qwen3_vl_moe.qwen3vl_moe import (
+         VEGModel, LLMModel, ModelArgs, VisionConfig, TextConfig, handle_multimodal_embeds
+     )
+     from .modeling.models.qwen3_vl_moe.processor import Qwen3VLProcessor
+ except ImportError:
+     # Fallback for the Nuitka-compiled environment: use the sys.path approach
+     from llm_common.generate import nexa_generate_step
+     from llm_common.cache import make_prompt_cache
+     from qwen3vl_moe import VEGModel, LLMModel, ModelArgs, VisionConfig, TextConfig, handle_multimodal_embeds
+     from processor import Qwen3VLProcessor
+
+ from ml import ChatMessage
+ from dataclasses import dataclass
+ from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
+ from .generate import GenerationResult
+
+ @dataclass
+ class Qwen3VLBundledModel:
+     """Container for Qwen3-VL MoE vision and language models."""
+     vision_model: VEGModel
+     llm_model: LLMModel
+
+
+ def _ensure_list(x: Union[str, List[str], None]) -> Optional[List[str]]:
+     if x is None:
+         return None
+     return x if isinstance(x, list) else [x]
+
+
+ def load_qwen3_vl(
+     path_or_repo: str,
+     adapter_path: Optional[str] = None,
+     lazy: bool = False,
+     revision: Optional[str] = None,
+     **kwargs,
+ ) -> Tuple[Qwen3VLBundledModel, Qwen3VLProcessor]:
+     """Load Qwen3-VL MoE quantized models and processor.
+
+     Parameters are aligned with .generate.load for compatibility.
+     """
+     model_path = Path(path_or_repo)
+     if not model_path.exists():
+         if "/" in path_or_repo:
+             model_path = Path(snapshot_download(
+                 repo_id=path_or_repo, repo_type="model", revision=revision))
+         else:
+             # Fall back to the local modelfiles directory
+             model_path = Path(qwen3vl_dir) / "modelfiles"
+             if not model_path.exists():
+                 model_path = Path(curr_dir) / "modelfiles"
+
+     # Model configs, updated to match the Qwen3VL-MoE specifications
+     vision_config = VisionConfig(
+         hidden_size=1152,
+         intermediate_size=4304,
+         num_heads=16,
+         num_hidden_layers=27,
+         patch_size=16,
+         temporal_patch_size=2,
+         in_channels=3,
+         hidden_act="gelu_pytorch_tanh",
+         spatial_merge_size=2,
+         out_hidden_size=2048,
+         num_position_embeddings=2304,
+         deepstack_visual_indexes=[8, 16, 24],
+     )
+
+     text_config = TextConfig(
+         model_type="qwen3_vl_moe_text",
+         hidden_size=2048,
+         num_hidden_layers=48,
+         intermediate_size=6144,
+         num_attention_heads=32,
+         num_key_value_heads=4,
+         rms_norm_eps=1e-6,
+         vocab_size=152064,
+         max_position_embeddings=128000,
+         rope_theta=1000000.0,
+         head_dim=128,
+         tie_word_embeddings=False,
+         attention_bias=False,
+         attention_dropout=0.0,
+         rope_scaling={
+             "mrope_interleaved": True,
+             "mrope_section": [24, 20, 20],
+             "rope_type": "default"
+         },
+         # MoE-specific parameters
+         num_experts=128,
+         num_experts_per_tok=8,
+         moe_intermediate_size=768,
+         shared_expert_intermediate_size=0,
+         norm_topk_prob=True,
+         decoder_sparse_step=1,
+         max_window_layers=48,
+         sliding_window=32768,
+         mlp_only_layers=[],
+         use_qk_norm=True,
+         layer_types=[],
+     )
+
+     vision_model = VEGModel(vision_config)
+     llm_model = LLMModel(text_config)
+
+     # Try to load the LLM weights from available files in order of preference
+     preferred_order = [
+         ("qwen3vl-moe-llm-30B-A3B-q4_0.safetensors", 4),
+         ("qwen3vl-moe-llm-30B-A3B-q8_0.safetensors", 8),
+         ("qwen3vl-moe-llm-30B-A3B-f32.safetensors", 32),
+     ]
+
+     llm_weights_path = None
+     quantization_bits = None
+
+     for filename, bits in preferred_order:
+         candidate_path = model_path / filename
+         if candidate_path.exists():
+             llm_weights_path = candidate_path
+             quantization_bits = bits
+             break
+
+     if llm_weights_path is None:
+         # Fall back to the original hardcoded path for backward compatibility
+         llm_weights_path = model_path / "qwen3vl-moe-llm-30B-A3B-q4_0.safetensors"
+         quantization_bits = 4
+
+     vision_weights_path = model_path / "qwen3vl-moe-vision-30B-A3B-f16.safetensors"
+
+     if not vision_weights_path.exists():
+         raise FileNotFoundError(
+             f"Missing vision weights: {vision_weights_path}"
+         )
+
+     # Load weights (vision in fp16, LLM with the detected quantization)
+     vision_model.set_dtype(mx.float16)
+     vision_model.load_weights(str(vision_weights_path), strict=True)
+
+     # Apply quantization if needed and load the LLM weights
+     if quantization_bits in [4, 8]:
+         nn.quantize(llm_model, bits=quantization_bits, group_size=64,
+                     class_predicate=quant_predicate)
+     # For f32 (32-bit), no quantization is needed
+
+     llm_model.load_weights(str(llm_weights_path), strict=True)
+
+     # Tokenizer and processor
+     tokenizer = AutoTokenizer.from_pretrained(path_or_repo)
+     processor = Qwen3VLProcessor(tokenizer=tokenizer)
+
+     return Qwen3VLBundledModel(vision_model=vision_model, llm_model=llm_model), processor
+
+ def apply_chat_template_qwen3_vl(messages: Sequence[ChatMessage], num_images: int = 0, num_audios: int = 0, tools: Optional[str] = None, enable_thinking: bool = False) -> str:
+     """Apply chat template: serialize messages with content as a list of typed items."""
+     messages_dict = []
+     for msg in messages:
+         content_items = [{"type": "text", "text": msg.content}]
+         messages_dict.append({"role": msg.role, "content": content_items})
+     return json.dumps(messages_dict)
+
+
+ def stream_generate_qwen3_vl(
+     model: Qwen3VLBundledModel,
+     processor: Qwen3VLProcessor,
+     prompt: str,
+     image: Union[str, List[str]] = None,
+     audio: Union[str, List[str]] = None,
+     max_tokens: int = 512,
+     **kwargs,
+ ) -> Generator[Any, None, None]:
+     """Stream generation yielding .generate.GenerationResult-compatible chunks."""
+     messages = json.loads(prompt)
+     if image is not None:
+         image_list = image if isinstance(image, list) else [image]
+         pil_images = []
+         for p in image_list:
+             try:
+                 pil_images.append(Image.open(p))
+             except Exception:
+                 continue
+         contents = [{"type": "image", "image": img} for img in pil_images]
+         if messages:
+             if "content" not in messages[-1] or not isinstance(messages[-1]["content"], list):
+                 messages[-1]["content"] = []
+             messages[-1]["content"].extend(contents)
+
+     raw_text, processed_images = processor.messages_to_text(
+         messages, add_generation_prompt=True)
+
+     inputs = processor.text_to_input_ids(
+         raw_text, images=processed_images, return_tensors="mlx")
+
+     input_ids = inputs["input_ids"]
+     pixel_values = inputs.get("pixel_values")
+     image_grid_thw = inputs.get("image_grid_thw")
+
+     inputs_embeds, deepstack_visual_embeds, visual_pos_masks, cos, sin, rope_deltas = handle_multimodal_embeds(
+         model.vision_model, model.llm_model, input_ids, pixel_values, image_grid_thw
+     )
+
+     prompt_cache = make_prompt_cache(model.llm_model, max_kv_size=4096)
+     tokenizer = processor.tokenizer
+
+     # Rough prompt TPS estimation based on input size
+     prompt_start = time.perf_counter()
+     prompt_tps = input_ids.size / max(1e-6, (time.perf_counter() - prompt_start))
+
+     gen_count = 0
+     tic = time.perf_counter()
+
+     for token, logprobs in nexa_generate_step(
+         model=model.llm_model,
+         prompt=None,
+         input_embeddings=inputs_embeds,
+         max_tokens=max_tokens,
+         max_kv_size=4096,
+         prompt_cache=prompt_cache,
+         visual_pos_masks=visual_pos_masks,
+         deepstack_visual_embeds=deepstack_visual_embeds,
+         cos=cos,
+         sin=sin,
+         rope_deltas=rope_deltas,
+     ):
+         if token == tokenizer.eos_token_id:
+             break
+
+         text_piece = tokenizer.decode([token])
+         gen_count += 1
+
+         yield GenerationResult(
+             text=text_piece,
+             token=token,
+             logprobs=logprobs,
+             prompt_tokens=int(input_ids.size),
+             generation_tokens=gen_count,
+             prompt_tps=float(prompt_tps),
+             generation_tps=float(
+                 gen_count / max(1e-6, (time.perf_counter() - tic))),
+             peak_memory=float(mx.get_peak_memory() / 1e9),
+         )
+
+ def quant_predicate(path: str, mod: nn.Module) -> bool:
+     """Quantization predicate to exclude certain layers from quantization."""
+     if path.endswith("lm_head") or "norm" in path.lower() or "embed" in path.lower():
+         return False
+     return isinstance(mod, (nn.Linear, nn.Embedding))
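
For orientation, the three entry points added above compose into a load / template / stream flow. The following is a minimal sketch, not code from the package: the model directory and image path are placeholders, and a stand-in dataclass replaces the ml.ChatMessage type imported above.

# Sketch only: drives load_qwen3_vl / apply_chat_template_qwen3_vl /
# stream_generate_qwen3_vl from the new module. Paths and the ChatMessage
# stand-in are assumptions, not part of this release.
from dataclasses import dataclass

from generate_qwen3_vl_moe import (
    apply_chat_template_qwen3_vl,
    load_qwen3_vl,
    stream_generate_qwen3_vl,
)

@dataclass
class ChatMessage:  # stand-in for ml.ChatMessage (role/content fields)
    role: str
    content: str

# Loads vision weights in fp16 and the MoE LLM at the best available
# quantization (q4_0 -> q8_0 -> f32, per preferred_order above).
model, processor = load_qwen3_vl("/path/to/modelfiles")

# The "template" is a JSON serialization; stream_generate parses it back.
prompt = apply_chat_template_qwen3_vl(
    [ChatMessage(role="user", content="Describe this image.")])

for chunk in stream_generate_qwen3_vl(
        model, processor, prompt, image="example.png", max_tokens=128):
    print(chunk.text, end="", flush=True)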
@@ -27,6 +27,10 @@ from profiling import ProfilingMixin, ProfilingData, StopReason
  from .generate import generate, stream_generate, load
  from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl, ContextLengthExceededError

+ from .generate_qwen3_vl_moe import apply_chat_template_qwen3_vl as apply_chat_template_qwen3_vl_moe
+ from .generate_qwen3_vl_moe import stream_generate_qwen3_vl as stream_generate_qwen3_vl_moe
+ from .generate_qwen3_vl_moe import load_qwen3_vl as load_qwen3_vl_moe
+
  from .modeling.prompt_utils import apply_chat_template

  # --------------------------------------------------------------------------------------
@@ -75,7 +79,13 @@ class VLM(ProfilingMixin):
75
79
  self.context_length = context_length
76
80
  self.device = device
77
81
 
78
- load_impl = load_qwen3_vl if model_name == "qwen3vl" else load
82
+ if model_name == "qwen3vl-moe":
83
+ load_impl = load_qwen3_vl_moe
84
+ elif model_name == "qwen3vl":
85
+ load_impl = load_qwen3_vl
86
+ else:
87
+ load_impl = load
88
+
79
89
  self.model, self.processor = load_impl(str(model_path))
80
90
 
81
91
  # Init deafutl sampler config with defualt.
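
A note on the dispatch: the if/elif chain added here keeps all three loaders behind a single call site. An equivalent table-driven form (a sketch of the same pattern, using the names imported in the previous hunk) would be:

# Sketch: table-driven equivalent of the loader dispatch above.
_LOADERS = {
    "qwen3vl-moe": load_qwen3_vl_moe,
    "qwen3vl": load_qwen3_vl,
}
load_impl = _LOADERS.get(model_name, load)  # generic loader as the default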
@@ -284,7 +294,13 @@
          text = ""
          last_result = None
          first_token = True
-         stream_generate_impl = stream_generate_qwen3_vl if self.model_name == "qwen3vl" else stream_generate
+
+         if self.model_name == "qwen3vl-moe":
+             stream_generate_impl = stream_generate_qwen3_vl_moe
+         elif self.model_name == "qwen3vl":
+             stream_generate_impl = stream_generate_qwen3_vl
+         else:
+             stream_generate_impl = stream_generate

          try:
              token_count = 0
@@ -430,8 +446,9 @@
          """Apply chat template to messages with proper image/audio token insertion and optional tools support."""
          if self.model_name == "qwen3vl":
              return apply_chat_template_qwen3_vl(messages, num_images=num_images, num_audios=num_audios, tools=tools, enable_thinking=enable_thinking)
-
-         # Convert ChatMessage objects to dictionaries for the processor
+         if self.model_name == "qwen3vl-moe":
+             return apply_chat_template_qwen3_vl_moe(messages, num_images=num_images, num_audios=num_audios, tools=tools, enable_thinking=enable_thinking)
+         # Convert ChatMessage objects to dictionaries for the processor
          messages_dict = [{"role": msg.role, "content": msg.content} for msg in messages]

          parsed_tools = None
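
Note that the qwen3vl-moe branch added here does not run a tokenizer chat template: per the new module above, apply_chat_template_qwen3_vl_moe only JSON-serializes the messages, and stream_generate_qwen3_vl_moe parses that JSON back before handing it to the processor. A round-trip illustration, with a hypothetical message:

# Illustration of the MoE chat-template helper defined above.
msgs = [ChatMessage(role="user", content="What is in this photo?")]
print(apply_chat_template_qwen3_vl_moe(msgs))
# [{"role": "user", "content": [{"type": "text", "text": "What is in this photo?"}]}]
# num_images, num_audios, tools and enable_thinking are accepted but unused.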
@@ -55,7 +55,7 @@ def parse_arguments():
          "--model_name",
          type=str,
          default="",
-         help="Specific model name/type (e.g., 'qwen3vl', 'gemma3'). If empty, auto-detect from model_path."
+         help="Specific model name/type (e.g., 'qwen3vl', 'qwen3vl-moe', 'gemma3'). If empty, auto-detect from model_path."
      )
      parser.add_argument(
          "--context_length",
@@ -89,8 +89,12 @@ def main():

      # Auto-detect model name if not provided
      model_name = args.model_name
+
+     # TODO: avoid such hardcoded model name detection
      if not model_name:
-         if "qwen" in args.model_path.lower():
+         if "qwen3vl-30b" in args.model_path.lower():
+             model_name = "qwen3vl-moe"
+         elif "qwen3" in args.model_path.lower():
              model_name = "qwen3vl"
          elif "gemma" in args.model_path.lower():
              model_name = "gemma3"
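
Because the detection runs against args.model_path.lower(), only lowercase needles can ever match, and the first matching branch wins, so the more specific qwen3vl-30b check must precede the broader qwen3 check. A small illustration with hypothetical paths:

# Illustration of the auto-detection order above (paths are hypothetical).
for path in ("Qwen3VL-30B-A3B-4bit", "Qwen3-VL-4B-Instruct", "gemma-3-4b-it"):
    p = path.lower()
    if "qwen3vl-30b" in p:
        print(path, "-> qwen3vl-moe")
    elif "qwen3" in p:
        print(path, "-> qwen3vl")
    elif "gemma" in p:
        print(path, "-> gemma3")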
@@ -0,0 +1,117 @@
+ import inspect
+ from dataclasses import dataclass
+ from typing import Any, Optional
+
+ import mlx.core as mx
+ from mlx.utils import tree_map
+
+ from .cache import QuantizedKVCache
+
+
+ @dataclass
+ class BaseModelArgs:
+     @classmethod
+     def from_dict(cls, params):
+         return cls(**{k: v for k, v in params.items() if k in inspect.signature(cls).parameters})
+
+
+ def create_causal_mask(
+     N: int,
+     offset: int = 0,
+     window_size: Optional[int] = None,
+     lengths: Optional[mx.array] = None,
+ ):
+     rinds = mx.arange(offset + N)
+     linds = mx.arange(offset, offset + N) if offset else rinds
+     linds = linds[:, None]
+     rinds = rinds[None]
+     mask = linds >= rinds
+     if window_size is not None:
+         mask = mask & (linds <= rinds + window_size)
+     if lengths is not None:
+         lengths = lengths[:, None, None, None]
+         mask = mask & (rinds < lengths)
+     return mask
+
+
+ def create_attention_mask(h: mx.array, cache: Optional[Any] = None, return_array: bool = False):
+     T = h.shape[1]
+     if T > 1:
+         offset = 0
+         window_size = None
+         if cache is not None and cache[0] is not None:
+             c = cache[0]
+             offset = c.offset
+             if hasattr(c, "max_size"):
+                 window_size = c.max_size
+                 offset = min(window_size, offset)
+                 return_array = return_array or offset + T > window_size
+         if return_array:
+             return create_causal_mask(T, offset, window_size=window_size)
+         else:
+             return "causal"
+     else:
+         mask = None
+     return mask
+
+
+ def quantized_scaled_dot_product_attention(
+     queries: mx.array,
+     q_keys: tuple[mx.array, mx.array, mx.array],
+     q_values: tuple[mx.array, mx.array, mx.array],
+     scale: float,
+     mask: Optional[mx.array],
+     group_size: int = 64,
+     bits: int = 8,
+ ) -> mx.array:
+     B, n_q_heads, L, D = queries.shape
+     n_kv_heads = q_keys[0].shape[-3]
+     n_repeats = n_q_heads // n_kv_heads
+
+     queries *= scale
+
+     if n_repeats > 1:
+         queries = mx.reshape(queries, (B, n_kv_heads, n_repeats, L, D))
+         q_keys = tree_map(lambda x: mx.expand_dims(x, axis=-3), q_keys)
+         q_values = tree_map(lambda x: mx.expand_dims(x, axis=-3), q_values)
+
+     scores = mx.quantized_matmul(queries, *q_keys, transpose=True, group_size=group_size, bits=bits)
+     if mask is not None:
+         if isinstance(mask, str):
+             qL, kL = scores.shape[-2:]
+             q_indices = mx.arange(kL - qL, kL)
+             k_indices = mx.arange(kL)
+             mask = q_indices[:, None] >= k_indices[None]
+         if mask.dtype == mx.bool_:
+             scores = mx.where(mask, scores, mx.finfo(scores.dtype).min)
+         else:
+             scores += mask
+     scores = mx.softmax(scores, axis=-1, precise=True)
+     out = mx.quantized_matmul(scores, *q_values, transpose=False, group_size=group_size, bits=bits)
+
+     if n_repeats > 1:
+         out = mx.reshape(out, (B, n_q_heads, L, D))
+
+     return out
+
+
+ def scaled_dot_product_attention(
+     queries,
+     keys,
+     values,
+     cache,
+     scale: float,
+     mask: Optional[mx.array],
+ ) -> mx.array:
+     if isinstance(cache, QuantizedKVCache):
+         return quantized_scaled_dot_product_attention(
+             queries,
+             keys,
+             values,
+             scale=scale,
+             mask=mask,
+             group_size=cache.group_size,
+             bits=cache.bits,
+         )
+     else:
+         return mx.fast.scaled_dot_product_attention(queries, keys, values, scale=scale, mask=mask)
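
To make the mask semantics in this new base.py concrete: create_causal_mask builds an (N, offset+N) boolean mask in which query position i (counted from the cache offset) may attend to every earlier key position, optionally clipped to a sliding window. A minimal check, assuming an MLX install and base.py on the path:

# Minimal check of create_causal_mask from the new base.py above.
import mlx.core as mx

# 3 new query positions on top of 2 cached positions -> a 3x5 mask.
mask = create_causal_mask(N=3, offset=2)
print(mask)
# Expected pattern (True = may attend):
# [[1 1 1 0 0]
#  [1 1 1 1 0]
#  [1 1 1 1 1]]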