nexaai 1.0.18__cp310-cp310-macosx_14_0_universal2.whl → 1.0.19__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nexaai might be problematic.

Files changed (215)
  1. nexaai/_stub.cpython-310-darwin.so +0 -0
  2. nexaai/_version.py +1 -1
  3. nexaai/asr.py +2 -1
  4. nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-base.dylib +0 -0
  5. nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libmtmd.dylib +0 -0
  6. nexaai/binds/{nexa_llama_cpp/libllama.dylib → cpu_gpu/libnexa_cpu_gpu.dylib} +0 -0
  7. nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libnexa_plugin.dylib +0 -0
  8. nexaai/binds/libnexa_bridge.dylib +0 -0
  9. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  10. nexaai/binds/{nexa_mlx → metal}/libnexa_plugin.dylib +0 -0
  11. nexaai/binds/{nexa_nexaml → nexaml}/libggml-base.dylib +0 -0
  12. nexaai/binds/{nexa_nexaml → nexaml}/libnexa-mm-process.dylib +0 -0
  13. nexaai/binds/{nexa_nexaml → nexaml}/libnexa-sampling.dylib +0 -0
  14. nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
  15. nexaai/binds/nexaml/libnexaproc.dylib +0 -0
  16. nexaai/binds/{nexa_nexaml → nexaml}/libomp.dylib +0 -0
  17. nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
  18. nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
  19. nexaai/cv.py +2 -1
  20. nexaai/embedder.py +1 -1
  21. nexaai/image_gen.py +2 -1
  22. nexaai/llm.py +5 -3
  23. nexaai/llm_impl/mlx_llm_impl.py +2 -0
  24. nexaai/llm_impl/pybind_llm_impl.py +2 -0
  25. nexaai/mlx_backend/vlm/generate_qwen3_vl.py +176 -96
  26. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
  27. nexaai/mlx_backend/vlm/interface.py +99 -30
  28. nexaai/mlx_backend/vlm/main.py +58 -9
  29. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +338 -299
  30. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  31. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  32. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  33. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  34. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  35. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  36. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  37. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  38. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
  39. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  40. nexaai/rerank.py +2 -1
  41. nexaai/tts.py +2 -1
  42. nexaai/utils/manifest_utils.py +222 -15
  43. nexaai/utils/model_manager.py +120 -14
  44. nexaai/utils/model_types.py +2 -0
  45. nexaai/vlm.py +2 -1
  46. {nexaai-1.0.18.dist-info → nexaai-1.0.19.dist-info}/METADATA +1 -2
  47. {nexaai-1.0.18.dist-info → nexaai-1.0.19.dist-info}/RECORD +211 -200
  48. nexaai/binds/nexa_nexaml/libnexa_plugin.dylib +0 -0
  49. nexaai/binds/nexa_nexaml/libnexaproc.dylib +0 -0
  50. nexaai/binds/nexa_nexaml/libqwen3-vl.dylib +0 -0
  51. nexaai/binds/nexa_nexaml/libqwen3vl-vision.dylib +0 -0
  52. /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-cpu.so +0 -0
  53. /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-metal.so +0 -0
  54. /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml.dylib +0 -0
  55. /nexaai/binds/{nexa_mlx → metal}/py-lib/ml.py +0 -0
  56. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/__init__.py +0 -0
  57. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/__init__.py +0 -0
  58. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/__init__.py +0 -0
  59. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +0 -0
  60. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/bigvgan/activation.py +0 -0
  61. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/bigvgan/amp.py +0 -0
  62. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +0 -0
  63. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/bigvgan/conv.py +0 -0
  64. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/bigvgan/resample.py +0 -0
  65. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/descript/__init__.py +0 -0
  66. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/descript/base.py +0 -0
  67. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/descript/dac.py +0 -0
  68. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +0 -0
  69. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/descript/nn/layers.py +0 -0
  70. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +0 -0
  71. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/encodec/__init__.py +0 -0
  72. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/encodec/encodec.py +0 -0
  73. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/__init__.py +0 -0
  74. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/mimi.py +0 -0
  75. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +0 -0
  76. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +0 -0
  77. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +0 -0
  78. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +0 -0
  79. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +0 -0
  80. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +0 -0
  81. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/s3/__init__.py +0 -0
  82. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/s3/model.py +0 -0
  83. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/s3/model_v2.py +0 -0
  84. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/s3/utils.py +0 -0
  85. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/snac/__init__.py +0 -0
  86. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/snac/attention.py +0 -0
  87. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/snac/layers.py +0 -0
  88. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/snac/snac.py +0 -0
  89. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/snac/vq.py +0 -0
  90. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/vocos/__init__.py +0 -0
  91. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/vocos/mel.py +0 -0
  92. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/models/vocos/vocos.py +0 -0
  93. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  94. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/test_bigvgan.py +0 -0
  95. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/test_descript.py +0 -0
  96. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/test_encodec.py +0 -0
  97. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/test_mimi.py +0 -0
  98. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/test_s3.py +0 -0
  99. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/test_snac.py +0 -0
  100. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/codec/tests/test_vocos.py +0 -0
  101. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/server.py +0 -0
  102. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/sts/__init__.py +0 -0
  103. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +0 -0
  104. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/sts/voice_pipeline.py +0 -0
  105. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/__init__.py +0 -0
  106. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/generate.py +0 -0
  107. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  108. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/__init__.py +0 -0
  109. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/alignment.py +0 -0
  110. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/attention.py +0 -0
  111. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/audio.py +0 -0
  112. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/conformer.py +0 -0
  113. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/ctc.py +0 -0
  114. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +0 -0
  115. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +0 -0
  116. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +0 -0
  117. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +0 -0
  118. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +0 -0
  119. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/whisper/__init__.py +0 -0
  120. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/whisper/audio.py +0 -0
  121. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/whisper/decoding.py +0 -0
  122. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/whisper/timing.py +0 -0
  123. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +0 -0
  124. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/whisper/whisper.py +0 -0
  125. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/models/whisper/writers.py +0 -0
  126. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/tests/test_models.py +0 -0
  127. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/stt/utils.py +0 -0
  128. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/__init__.py +0 -0
  129. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/audio_player.py +0 -0
  130. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/convert.py +0 -0
  131. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/generate.py +0 -0
  132. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  133. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/bark/__init__.py +0 -0
  134. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/bark/bark.py +0 -0
  135. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/bark/isftnet.py +0 -0
  136. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/bark/pipeline.py +0 -0
  137. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/base.py +0 -0
  138. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/dia/__init__.py +0 -0
  139. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/dia/audio.py +0 -0
  140. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/dia/config.py +0 -0
  141. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/dia/dia.py +0 -0
  142. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/dia/layers.py +0 -0
  143. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/__init__.py +0 -0
  144. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/attention.py +0 -0
  145. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +0 -0
  146. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/conformer.py +0 -0
  147. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  148. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +0 -0
  149. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +0 -0
  150. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +0 -0
  151. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +0 -0
  152. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/gpt2.py +0 -0
  153. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/indextts.py +0 -0
  154. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/mel.py +0 -0
  155. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/normalize.py +0 -0
  156. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/indextts/perceiver.py +0 -0
  157. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/interpolate.py +0 -0
  158. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/kokoro/__init__.py +0 -0
  159. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +0 -0
  160. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +0 -0
  161. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/kokoro/modules.py +0 -0
  162. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +0 -0
  163. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/kokoro/voice.py +0 -0
  164. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/llama/__init__.py +0 -0
  165. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/llama/llama.py +0 -0
  166. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/outetts/__init__.py +0 -0
  167. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +0 -0
  168. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +0 -0
  169. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/outetts/outetts.py +0 -0
  170. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +0 -0
  171. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/outetts/tokens.py +0 -0
  172. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/sesame/__init__.py +0 -0
  173. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/sesame/attention.py +0 -0
  174. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/sesame/sesame.py +0 -0
  175. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/sesame/watermarking.py +0 -0
  176. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/__init__.py +0 -0
  177. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +0 -0
  178. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/bicodec.py +0 -0
  179. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  180. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  181. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +0 -0
  182. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  183. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +0 -0
  184. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +0 -0
  185. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +0 -0
  186. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +0 -0
  187. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/residual.py +0 -0
  188. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +0 -0
  189. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +0 -0
  190. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +0 -0
  191. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +0 -0
  192. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +0 -0
  193. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +0 -0
  194. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/spark.py +0 -0
  195. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/utils/audio.py +0 -0
  196. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/utils/file.py +0 -0
  197. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +0 -0
  198. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  199. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/tests/test_base.py +0 -0
  200. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/tests/test_convert.py +0 -0
  201. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/tests/test_interpolate.py +0 -0
  202. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/tests/test_models.py +0 -0
  203. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/tts/utils.py +0 -0
  204. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/utils.py +0 -0
  205. /nexaai/binds/{nexa_mlx → metal}/py-lib/mlx_audio/version.py +0 -0
  206. /nexaai/binds/{nexa_mlx → metal}/py-lib/profiling.py +0 -0
  207. /nexaai/binds/{nexa_nexaml → nexaml}/libfftw3.3.dylib +0 -0
  208. /nexaai/binds/{nexa_nexaml → nexaml}/libfftw3f.3.dylib +0 -0
  209. /nexaai/binds/{nexa_nexaml → nexaml}/libggml-cpu.so +0 -0
  210. /nexaai/binds/{nexa_nexaml → nexaml}/libggml-metal.so +0 -0
  211. /nexaai/binds/{nexa_nexaml → nexaml}/libggml.dylib +0 -0
  212. /nexaai/binds/{nexa_nexaml → nexaml}/libmp3lame.0.dylib +0 -0
  213. /nexaai/binds/{nexa_nexaml → nexaml}/libmpg123.0.dylib +0 -0
  214. {nexaai-1.0.18.dist-info → nexaai-1.0.19.dist-info}/WHEEL +0 -0
  215. {nexaai-1.0.18.dist-info → nexaai-1.0.19.dist-info}/top_level.txt +0 -0
nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py
@@ -0,0 +1,259 @@
+ import argparse
+ import json
+ import os
+ import mlx.core as mx
+ import mlx.nn as nn
+ import time
+ from PIL import Image
+ import requests
+ import numpy as np
+ from pathlib import Path
+ from huggingface_hub import snapshot_download
+ from dataclasses import dataclass
+ from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
+
+ # Import required modules for quantized loading
+ from transformers import AutoTokenizer
+
+ # Import from the nested modeling structure
+ from .modeling.models.qwen3vl_moe.llm_common.generate import nexa_generate_step
+ from .modeling.models.qwen3vl_moe.llm_common.cache import make_prompt_cache
+ from .modeling.models.qwen3vl_moe.qwen3vl_moe import (
+     VEGModel, LLMModel, ModelArgs, VisionConfig, TextConfig, handle_multimodal_embeds
+ )
+ from .modeling.models.qwen3vl_moe.processor import Qwen3VLProcessor
+ from .generate import GenerationResult
+ from ml import ChatMessage
+
+ @dataclass
+ class Qwen3VLBundledModel:
+     """Container for Qwen3-VL MoE vision and language models."""
+     vision_model: VEGModel
+     llm_model: LLMModel
+
+
+ def _ensure_list(x: Union[str, List[str], None]) -> Optional[List[str]]:
+     if x is None:
+         return None
+     return x if isinstance(x, list) else [x]
+
+
+ def load_qwen3_vl(
+     path_or_repo: str,
+     adapter_path: Optional[str] = None,
+     lazy: bool = False,
+     revision: Optional[str] = None,
+     **kwargs,
+ ) -> Tuple[Qwen3VLBundledModel, Qwen3VLProcessor]:
+     """Load Qwen3-VL MoE quantized models and processor.
+
+     Parameters are aligned with .generate.load for compatibility.
+     """
+     model_path = Path(path_or_repo)
+     if not model_path.exists():
+         if "/" in path_or_repo:
+             model_path = Path(snapshot_download(
+                 repo_id=path_or_repo, repo_type="model", revision=revision))
+         else:
+             # Fallback to local modelfiles directory relative to this file
+             curr_dir = Path(__file__).parent
+             model_path = curr_dir / "modeling" / "models" / "qwen3vl_moe" / "modelfiles"
+             if not model_path.exists():
+                 model_path = curr_dir / "modelfiles"
+
+     # Model configs - Updated to match Qwen3VL-MoE specifications
+     vision_config = VisionConfig(
+         hidden_size=1152,
+         intermediate_size=4304,
+         num_heads=16,
+         num_hidden_layers=27,
+         patch_size=16,
+         temporal_patch_size=2,
+         in_channels=3,
+         hidden_act="gelu_pytorch_tanh",
+         spatial_merge_size=2,
+         out_hidden_size=2048,
+         num_position_embeddings=2304,
+         deepstack_visual_indexes=[8, 16, 24],
+     )
+
+     text_config = TextConfig(
+         model_type="qwen3_vl_moe_text",
+         hidden_size=2048,
+         num_hidden_layers=48,
+         intermediate_size=6144,
+         num_attention_heads=32,
+         num_key_value_heads=4,
+         rms_norm_eps=1e-6,
+         vocab_size=152064,
+         max_position_embeddings=128000,
+         rope_theta=1000000.0,
+         head_dim=128,
+         tie_word_embeddings=False,
+         attention_bias=False,
+         attention_dropout=0.0,
+         rope_scaling={
+             "mrope_interleaved": True,
+             "mrope_section": [24, 20, 20],
+             "rope_type": "default"
+         },
+         # MoE specific parameters
+         num_experts=128,
+         num_experts_per_tok=8,
+         moe_intermediate_size=768,
+         shared_expert_intermediate_size=0,
+         norm_topk_prob=True,
+         decoder_sparse_step=1,
+         max_window_layers=48,
+         sliding_window=32768,
+         mlp_only_layers=[],
+         use_qk_norm=True,
+         layer_types=[],
+     )
+
+     vision_model = VEGModel(vision_config)
+     llm_model = LLMModel(text_config)
+
+     # Try to load LLM model from available files in order of preference
+     preferred_order = [
+         ("qwen3vl-moe-llm-30B-A3B-q4_0.safetensors", 4),
+         ("qwen3vl-moe-llm-30B-A3B-q8_0.safetensors", 8),
+         ("qwen3vl-moe-llm-30B-A3B-f32.safetensors", 32),
+     ]
+
+     llm_weights_path = None
+     quantization_bits = None
+
+     # Try loading in order of preference
+     for filename, bits in preferred_order:
+         candidate_path = model_path / filename
+         if candidate_path.exists():
+             llm_weights_path = candidate_path
+             quantization_bits = bits
+             break
+
+     if llm_weights_path is None:
+         # Fallback to original hardcoded path for backward compatibility
+         llm_weights_path = model_path / "qwen3vl-moe-llm-30B-A3B-q4_0.safetensors"
+         quantization_bits = 4
+
+     vision_weights_path = model_path / "qwen3vl-moe-vision-30B-A3B-f16.safetensors"
+
+     if not vision_weights_path.exists():
+         raise FileNotFoundError(
+             f"Missing vision weights: {vision_weights_path}"
+         )
+
+     # Load weights (vision fp16, llm with detected quantization)
+     vision_model.set_dtype(mx.float16)
+     vision_model.load_weights(str(vision_weights_path), strict=True)
+
+     # Apply quantization if needed and load LLM weights
+     if quantization_bits in [4, 8]:
+         nn.quantize(llm_model, bits=quantization_bits, group_size=64,
+                     class_predicate=quant_predicate)
+     # For f32 (32-bit), no quantization needed
+
+     llm_model.load_weights(str(llm_weights_path), strict=True)
+
+     # Tokenizer and processor
+     tokenizer = AutoTokenizer.from_pretrained(path_or_repo)
+     processor = Qwen3VLProcessor(tokenizer=tokenizer)
+
+     return Qwen3VLBundledModel(vision_model=vision_model, llm_model=llm_model), processor
+
+ def apply_chat_template_qwen3_vl(messages: Sequence[ChatMessage], num_images: int = 0, num_audios: int = 0, tools: Optional[str] = None, enable_thinking: bool = False) -> str:
+     """Apply chat template: serialize messages with content as a list of typed items."""
+     messages_dict = []
+     for msg in messages:
+         content_items = [{"type": "text", "text": msg.content}]
+         messages_dict.append({"role": msg.role, "content": content_items})
+     return json.dumps(messages_dict)
+
+
+ def stream_generate_qwen3_vl(
+     model: Qwen3VLBundledModel,
+     processor: Qwen3VLProcessor,
+     prompt: str,
+     image: Union[str, List[str]] = None,
+     audio: Union[str, List[str]] = None,
+     max_tokens: int = 512,
+     **kwargs,
+
+ ) -> Generator[Any, None, None]:
+     """Stream generation yielding .generate.GenerationResult-compatible chunks."""
+     messages = json.loads(prompt)
+     if image is not None:
+         image_list = image if isinstance(image, list) else [image]
+         pil_images = []
+         for p in image_list:
+             try:
+                 pil_images.append(Image.open(p))
+             except Exception:
+                 continue
+         contents = [{"type": "image", "image": img} for img in pil_images]
+         if messages:
+             if "content" not in messages[-1] or not isinstance(messages[-1]["content"], list):
+                 messages[-1]["content"] = []
+             messages[-1]["content"].extend(contents)
+
+     raw_text, processed_images = processor.messages_to_text(
+         messages, add_generation_prompt=True)
+
+     inputs = processor.text_to_input_ids(
+         raw_text, images=processed_images, return_tensors="mlx")
+
+     input_ids = inputs["input_ids"]
+     pixel_values = inputs.get("pixel_values")
+     image_grid_thw = inputs.get("image_grid_thw")
+
+     inputs_embeds, deepstack_visual_embeds, visual_pos_masks, cos, sin, rope_deltas = handle_multimodal_embeds(
+         model.vision_model, model.llm_model, input_ids, pixel_values, image_grid_thw
+     )
+
+     prompt_cache = make_prompt_cache(model.llm_model, max_kv_size=4096)
+     tokenizer = processor.tokenizer
+
+     # Rough prompt TPS estimation based on input size
+     prompt_start = time.perf_counter()
+     prompt_tps = input_ids.size / max(1e-6, (time.perf_counter() - prompt_start))
+
+     gen_count = 0
+     tic = time.perf_counter()
+
+     for token, logprobs in nexa_generate_step(
+         model=model.llm_model,
+         prompt=None,
+         input_embeddings=inputs_embeds,
+         max_tokens=max_tokens,
+         max_kv_size=4096,
+         prompt_cache=prompt_cache,
+         visual_pos_masks=visual_pos_masks,
+         deepstack_visual_embeds=deepstack_visual_embeds,
+         cos=cos,
+         sin=sin,
+         rope_deltas=rope_deltas,
+     ):
+         if token == tokenizer.eos_token_id:
+             break
+
+         text_piece = tokenizer.decode([token])
+         gen_count += 1
+
+         yield GenerationResult(
+             text=text_piece,
+             token=token,
+             logprobs=logprobs,
+             prompt_tokens=int(input_ids.size),
+             generation_tokens=gen_count,
+             prompt_tps=float(prompt_tps),
+             generation_tps=float(
+                 gen_count / max(1e-6, (time.perf_counter() - tic))),
+             peak_memory=float(mx.get_peak_memory() / 1e9),
+         )
+
+ def quant_predicate(path: str, mod: nn.Module) -> bool:
+     """Quantization predicate to exclude certain layers from quantization."""
+     if path.endswith("lm_head") or "norm" in path.lower() or "embed" in path.lower():
+         return False
+     return isinstance(mod, (nn.Linear, nn.Embedding))
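
To make the new module easier to follow, here is a minimal, hypothetical usage sketch based only on the functions defined in the hunk above; the model directory and image path are placeholders, and it assumes the bundled `ml` helper module is importable so that `generate_qwen3_vl_moe` itself can be imported.

    import json
    from nexaai.mlx_backend.vlm.generate_qwen3_vl_moe import load_qwen3_vl, stream_generate_qwen3_vl

    # Placeholder: a local directory containing the qwen3vl-moe safetensors and
    # tokenizer files referenced by load_qwen3_vl above.
    model, processor = load_qwen3_vl("path/to/qwen3vl-moe-modelfiles")

    # stream_generate_qwen3_vl expects the prompt as a JSON-serialized message
    # list, matching what apply_chat_template_qwen3_vl produces.
    prompt = json.dumps([
        {"role": "user", "content": [{"type": "text", "text": "Describe this image."}]}
    ])

    for chunk in stream_generate_qwen3_vl(model, processor, prompt,
                                          image="photo.jpg", max_tokens=128):
        print(chunk.text, end="", flush=True)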
nexaai/mlx_backend/vlm/interface.py
@@ -27,6 +27,10 @@ from profiling import ProfilingMixin, ProfilingData, StopReason
  from .generate import generate, stream_generate, load
  from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl, ContextLengthExceededError

+ from .generate_qwen3_vl_moe import apply_chat_template_qwen3_vl as apply_chat_template_qwen3_vl_moe
+ from .generate_qwen3_vl_moe import stream_generate_qwen3_vl as stream_generate_qwen3_vl_moe
+ from .generate_qwen3_vl_moe import load_qwen3_vl as load_qwen3_vl_moe
+
  from .modeling.prompt_utils import apply_chat_template

  # --------------------------------------------------------------------------------------
@@ -75,8 +79,15 @@ class VLM(ProfilingMixin):
          self.context_length = context_length
          self.device = device

-         load_impl = load_qwen3_vl if model_name == "qwen3vl" else load
-         self.model, self.processor = load_impl(str(model_path))
+         if model_name == "qwen3vl-moe":
+             load_impl = load_qwen3_vl_moe
+         elif model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking"]:
+             load_impl = load_qwen3_vl
+         else:
+             load_impl = load
+
+         # Pass model_name to the loader for proper configuration
+         self.model, self.processor = load_impl(str(model_path), model_name=model_name)

          # Init deafutl sampler config with defualt.
          self.sampler_config = SamplerConfig()
@@ -84,6 +95,19 @@
          # Track global character position for incremental processing
          self.global_n_past_chars = 0

+         # Add conversation state tracking to VLM class
+         if model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking"]:
+             # Import here to avoid circular imports
+             from .modeling.models.qwen3_vl.llm_common.cache import make_prompt_cache
+             import mlx.core as mx
+
+             # Initialize conversation state
+             self.rope_deltas_total = mx.zeros((1, 1), dtype=mx.int32)
+             self.prompt_cache = make_prompt_cache(self.model.llm_model, max_kv_size=4096)
+         else:
+             self.rope_deltas_total = None
+             self.prompt_cache = None
+
      def destroy(self) -> None:
          """Destroy the model and free resources."""
          self.model = None
@@ -93,6 +117,14 @@
          """Reset the model state."""
          self._reset_cache()
          self.global_n_past_chars = 0
+
+         # Reset conversation state for qwen3vl models
+         if self.model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking"]:
+             import mlx.core as mx
+             from .modeling.models.qwen3_vl.llm_common.cache import make_prompt_cache
+
+             self.rope_deltas_total = mx.zeros((1, 1), dtype=mx.int32)
+             self.prompt_cache = make_prompt_cache(self.model.llm_model, max_kv_size=4096)

      def _reset_cache(self) -> None:
          """Reset the KV cache."""
@@ -270,7 +302,7 @@

          # Apply incremental processing only for non-qwen3vl models
          # qwen3vl requires complete JSON conversation structure
-         if self.model_name != "qwen3vl":
+         if self.model_name not in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking", "qwen3vl-moe"]:
              if self.global_n_past_chars < full_prompt_len:
                  incremental_prompt = prompt[self.global_n_past_chars:]
              else:
@@ -284,33 +316,70 @@
          text = ""
          last_result = None
          first_token = True
-         stream_generate_impl = stream_generate_qwen3_vl if self.model_name == "qwen3vl" else stream_generate
+
+         if self.model_name == "qwen3vl-moe":
+             stream_generate_impl = stream_generate_qwen3_vl_moe
+         elif self.model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking"]:
+             stream_generate_impl = stream_generate_qwen3_vl
+         else:
+             stream_generate_impl = stream_generate

          try:
              token_count = 0

-             for result in stream_generate_impl(
-                 self.model,
-                 self.processor,
-                 incremental_prompt,  # Use incremental prompt instead of full prompt
-                 image=image_list,
-                 audio=audio_list,
-                 **gen_kwargs,
-             ):
-                 token_count += 1
-
-                 # Record TTFT on first token
-                 if first_token:
-                     self._record_ttft()
-                     first_token = False
-
-                 # Call the token callback if provided
-                 if on_token is not None:
-                     if not on_token(result.text):
-                         self._set_stop_reason(StopReason.ML_STOP_REASON_USER)
-                         break
-                 text += result.text
-                 last_result = result
+             # Pass conversation state for qwen3vl models
+             if self.model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking"]:
+                 for result in stream_generate_impl(
+                     self.model,
+                     self.processor,
+                     incremental_prompt,
+                     image=image_list,
+                     audio=audio_list,
+                     rope_deltas_total=self.rope_deltas_total,  # Pass conversation state
+                     prompt_cache=self.prompt_cache,  # Pass KV cache
+                     **gen_kwargs,
+                 ):
+                     token_count += 1
+
+                     # Record TTFT on first token
+                     if first_token:
+                         self._record_ttft()
+                         first_token = False
+
+                     # Call the token callback if provided
+                     if on_token is not None:
+                         if not on_token(result.text):
+                             self._set_stop_reason(StopReason.ML_STOP_REASON_USER)
+                             break
+                     text += result.text
+                     last_result = result
+
+                 # Update conversation state after each token
+                 # Note: rope_deltas_total is updated inside stream_generate_qwen3_vl
+
+             else:
+                 for result in stream_generate_impl(
+                     self.model,
+                     self.processor,
+                     incremental_prompt,
+                     image=image_list,
+                     audio=audio_list,
+                     **gen_kwargs,
+                 ):
+                     token_count += 1
+
+                     # Record TTFT on first token
+                     if first_token:
+                         self._record_ttft()
+                         first_token = False
+
+                     # Call the token callback if provided
+                     if on_token is not None:
+                         if not on_token(result.text):
+                             self._set_stop_reason(StopReason.ML_STOP_REASON_USER)
+                             break
+                     text += result.text
+                     last_result = result


              # Set stop reason if not user stop
@@ -323,7 +392,7 @@
              self._update_generated_tokens(last_result.generation_tokens)

          # Update global character position (not needed for qwen3vl JSON processing)
-         if self.model_name != "qwen3vl":
+         if self.model_name not in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking", "qwen3vl-moe"]:
              old_pos = self.global_n_past_chars
              self.global_n_past_chars = full_prompt_len + len(text)

@@ -428,10 +497,10 @@

      def apply_chat_template_with_media(self, messages: Sequence[ChatMessage], num_images: int = 0, num_audios: int = 0, tools: Optional[str] = None, enable_thinking: bool = True) -> str:
          """Apply chat template to messages with proper image/audio token insertion and optional tools support."""
-         if self.model_name == "qwen3vl":
+         if self.model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking", "qwen3vl-8b", "qwen3vl-8b-thinking"]:
              return apply_chat_template_qwen3_vl(messages, num_images=num_images, num_audios=num_audios, tools=tools, enable_thinking=enable_thinking)
-
-         # Convert ChatMessage objects to dictionaries for the processor
+         if self.model_name == "qwen3vl-moe":
+             return apply_chat_template_qwen3_vl_moe(messages, num_images=num_images, num_audios=num_audios, tools=tools, enable_thinking=enable_thinking)
          messages_dict = [{"role": msg.role, "content": msg.content} for msg in messages]

          parsed_tools = None
nexaai/mlx_backend/vlm/main.py
@@ -40,6 +40,57 @@ def parse_media_from_input(user_input):
      return prompt, image_paths if image_paths else None, audio_paths if audio_paths else None


+ def detect_model_name_and_repo(model_path):
+     """Detect model name and corresponding HuggingFace repo based on model path or name"""
+     model_path_lower = model_path.lower()
+
+     # Handle HuggingFace repo format
+     if "/" in model_path:
+         repo_name = model_path.split("/")[-1] if model_path.endswith("/") else model_path.split("/")[-1]
+         repo_name_lower = repo_name.lower()
+     else:
+         repo_name_lower = model_path_lower
+
+     # Model name mapping based on the provided examples
+     model_mappings = {
+         # 4B models
+         "qwen3vl-4b-4bit-mlx": ("qwen3vl-4b", "NexaAI/qwen3vl-4B-4bit-mlx"),
+         "qwen3vl-4b-fp16-mlx": ("qwen3vl-4b", "NexaAI/qwen3vl-4B-fp16-mlx"),
+         "qwen3vl-4b-thinking-4bit-mlx": ("qwen3vl-4b-thinking", "NexaAI/qwen3vl-4B-thinking-4bit-mlx"),
+         "qwen3vl-4b-thinking-fp16-mlx": ("qwen3vl-4b-thinking", "NexaAI/qwen3vl-4B-thinking-fp16-mlx"),
+
+         # 8B models
+         "qwen3vl-8b-4bit-mlx": ("qwen3vl-8b", "NexaAI/qwen3vl-8B-4bit-mlx"),
+         "qwen3vl-8b-fp16-mlx": ("qwen3vl-8b", "NexaAI/qwen3vl-8B-fp16-mlx"),
+         "qwen3vl-8b-thinking-4bit-mlx": ("qwen3vl-8b-thinking", "NexaAI/qwen3vl-8B-thinking-4bit-mlx"),
+         "qwen3vl-8b-thinking-fp16-mlx": ("qwen3vl-8b-thinking", "NexaAI/qwen3vl-8B-thinking-fp16-mlx"),
+     }
+
+     # Check exact matches first
+     for key, (model_name, repo) in model_mappings.items():
+         if key in repo_name_lower:
+             return model_name, repo if "/" not in model_path else model_path
+
+     # Fallback detection based on patterns
+     if "qwen3vl" in repo_name_lower:
+         if "8b" in repo_name_lower:
+             if "thinking" in repo_name_lower:
+                 return "qwen3vl-8b-thinking", model_path
+             else:
+                 return "qwen3vl-8b", model_path
+         elif "4b" in repo_name_lower:
+             if "thinking" in repo_name_lower:
+                 return "qwen3vl-4b-thinking", model_path
+             else:
+                 return "qwen3vl-4b", model_path
+         else:
+             # Default to 4B if size not specified
+             return "qwen3vl-4b", model_path
+     elif "gemma" in repo_name_lower:
+         return "gemma3", model_path
+
+     return "", model_path
+
  def parse_arguments():
      """Parse command line arguments for the VLM main function."""
      parser = argparse.ArgumentParser(
@@ -48,14 +99,14 @@
      parser.add_argument(
          "--model_path",
          type=str,
-         default="mlx-community/gemma-3-4b-it-8bit",
+         default="NexaAI/qwen3vl-4B-4bit-mlx",
          help="The path to the local model directory or Hugging Face repo."
      )
      parser.add_argument(
          "--model_name",
          type=str,
          default="",
-         help="Specific model name/type (e.g., 'qwen3vl', 'gemma3'). If empty, auto-detect from model_path."
+         help="Specific model name/type (e.g., 'qwen3vl-4b', 'qwen3vl-4b-thinking', 'qwen3vl-8b', 'qwen3vl-8b-thinking'). If empty, auto-detect from model_path."
      )
      parser.add_argument(
          "--context_length",
@@ -89,18 +140,16 @@

      # Auto-detect model name if not provided
      model_name = args.model_name
+     model_path = args.model_path
+
      if not model_name:
-         if "qwen" in args.model_path.lower():
-             model_name = "qwen3vl"
-         elif "gemma" in args.model_path.lower():
-             model_name = "gemma3"
-         else:
-             model_name = ""
+         model_name, model_path = detect_model_name_and_repo(args.model_path)
+         print(f"Auto-detected model: {model_name} from path: {model_path}")

      # Load the VLM instance
      vlm = VLM(
          model_name=model_name,
-         model_path=args.model_path,
+         model_path=model_path,
          mmproj_path=None,  # Not needed for this model
          context_length=args.context_length,
          device=None