nexaai-1.0.29-cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (580)
  1. nexaai/__init__.py +99 -0
  2. nexaai/_stub.cpython-310-darwin.so +0 -0
  3. nexaai/_version.py +4 -0
  4. nexaai/asr.py +68 -0
  5. nexaai/asr_impl/__init__.py +0 -0
  6. nexaai/asr_impl/mlx_asr_impl.py +93 -0
  7. nexaai/asr_impl/pybind_asr_impl.py +127 -0
  8. nexaai/base.py +39 -0
  9. nexaai/binds/__init__.py +7 -0
  10. nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  12. nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
  13. nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
  14. nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
  15. nexaai/binds/cpu_gpu/libggml.dylib +0 -0
  16. nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
  17. nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
  18. nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
  19. nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
  20. nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
  21. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  22. nexaai/binds/libnexa_bridge.dylib +0 -0
  23. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  24. nexaai/binds/metal/libnexa_plugin.dylib +0 -0
  25. nexaai/binds/metal/py-lib/ml.py +888 -0
  26. nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
  27. nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
  28. nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  29. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  30. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  31. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  32. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  33. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  34. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  35. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  36. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  37. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  38. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  39. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  40. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  41. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  42. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  43. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  44. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  45. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  46. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  47. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  48. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  49. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  50. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  51. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  52. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  53. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  54. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  55. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  56. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  57. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  58. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  59. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  60. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  61. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  62. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  63. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  64. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  65. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  66. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  67. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  68. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  69. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  70. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  71. nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
  72. nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
  73. nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  74. nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  75. nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
  76. nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
  77. nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  78. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  79. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  80. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  81. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  82. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  83. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  84. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  85. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  86. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  87. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  88. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  89. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  90. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  91. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  92. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  93. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  94. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  95. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  96. nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  97. nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
  98. nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
  99. nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
  100. nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
  101. nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
  102. nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  103. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  104. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  105. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  106. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  107. nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
  108. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  109. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  110. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  111. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  112. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  113. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  114. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  115. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  116. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  117. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  118. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  119. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  120. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  121. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  122. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  123. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  124. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  125. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  126. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  127. nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  128. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  129. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  130. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  131. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  132. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  133. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  134. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  135. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  136. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  137. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  138. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  139. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  140. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  141. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  142. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  143. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  144. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  145. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  146. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  147. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  148. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  149. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  150. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  151. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  152. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  153. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  154. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  155. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  156. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  157. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  158. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  159. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  160. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  161. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  162. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  163. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  164. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  165. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  166. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  167. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  168. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  169. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  170. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  171. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  172. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  173. nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
  174. nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
  175. nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
  176. nexaai/binds/metal/py-lib/profiling.py +239 -0
  177. nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
  178. nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
  179. nexaai/binds/nexaml/libggml-base.dylib +0 -0
  180. nexaai/binds/nexaml/libggml-cpu.so +0 -0
  181. nexaai/binds/nexaml/libggml-metal.so +0 -0
  182. nexaai/binds/nexaml/libggml.dylib +0 -0
  183. nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
  184. nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
  185. nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
  186. nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
  187. nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
  188. nexaai/binds/nexaml/libnexaproc.dylib +0 -0
  189. nexaai/binds/nexaml/libomp.dylib +0 -0
  190. nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
  191. nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
  192. nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
  193. nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
  194. nexaai/common.py +106 -0
  195. nexaai/cv.py +95 -0
  196. nexaai/cv_impl/__init__.py +0 -0
  197. nexaai/cv_impl/mlx_cv_impl.py +91 -0
  198. nexaai/cv_impl/pybind_cv_impl.py +124 -0
  199. nexaai/diarize.py +80 -0
  200. nexaai/diarize_impl/__init__.py +1 -0
  201. nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
  202. nexaai/embedder.py +73 -0
  203. nexaai/embedder_impl/__init__.py +0 -0
  204. nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
  205. nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
  206. nexaai/image_gen.py +141 -0
  207. nexaai/image_gen_impl/__init__.py +0 -0
  208. nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
  209. nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
  210. nexaai/llm.py +98 -0
  211. nexaai/llm_impl/__init__.py +0 -0
  212. nexaai/llm_impl/mlx_llm_impl.py +271 -0
  213. nexaai/llm_impl/pybind_llm_impl.py +238 -0
  214. nexaai/log.py +92 -0
  215. nexaai/mlx_backend/asr/__init__.py +12 -0
  216. nexaai/mlx_backend/asr/interface.py +122 -0
  217. nexaai/mlx_backend/common/__init__.py +0 -0
  218. nexaai/mlx_backend/common/utils.py +25 -0
  219. nexaai/mlx_backend/cv/__init__.py +0 -0
  220. nexaai/mlx_backend/cv/generate.py +195 -0
  221. nexaai/mlx_backend/cv/interface.py +162 -0
  222. nexaai/mlx_backend/cv/main.py +81 -0
  223. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  224. nexaai/mlx_backend/embedding/__init__.py +0 -0
  225. nexaai/mlx_backend/embedding/generate.py +333 -0
  226. nexaai/mlx_backend/embedding/interface.py +617 -0
  227. nexaai/mlx_backend/embedding/main.py +173 -0
  228. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  229. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  230. nexaai/mlx_backend/image_gen/__init__.py +1 -0
  231. nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
  232. nexaai/mlx_backend/image_gen/interface.py +82 -0
  233. nexaai/mlx_backend/image_gen/main.py +281 -0
  234. nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
  235. nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
  236. nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
  237. nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
  238. nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
  239. nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
  240. nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
  241. nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
  242. nexaai/mlx_backend/llm/__init__.py +0 -0
  243. nexaai/mlx_backend/llm/generate.py +149 -0
  244. nexaai/mlx_backend/llm/interface.py +764 -0
  245. nexaai/mlx_backend/llm/main.py +68 -0
  246. nexaai/mlx_backend/ml.py +888 -0
  247. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  249. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  250. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  251. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  252. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  253. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  254. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  255. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  256. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  257. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  258. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  259. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  260. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  261. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  262. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  264. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  265. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  266. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  267. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  268. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  269. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  270. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  271. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  272. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  273. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  274. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  275. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  276. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  277. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  278. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  279. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  280. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  281. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  282. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  283. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  284. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  285. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  286. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  287. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  288. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  289. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  290. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  291. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  292. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  293. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  294. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  295. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  296. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  297. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  298. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  299. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  300. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  301. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  302. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  303. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  304. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  305. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  306. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  307. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  308. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  309. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  310. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  311. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  312. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  313. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  314. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  315. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  316. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  317. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  318. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  319. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  320. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  321. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  322. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  353. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  354. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  355. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  356. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  357. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  358. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  359. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  360. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  361. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  362. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  363. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  364. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  365. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  366. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  367. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  368. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  369. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  370. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  371. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  372. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  373. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  374. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  375. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  376. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  377. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  378. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  379. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  380. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  381. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  382. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  383. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  384. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  385. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  386. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  387. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  388. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  389. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  390. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  391. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  392. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  393. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  394. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  395. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  396. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  397. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  398. nexaai/mlx_backend/profiling.py +239 -0
  399. nexaai/mlx_backend/rerank/__init__.py +0 -0
  400. nexaai/mlx_backend/rerank/generate.py +174 -0
  401. nexaai/mlx_backend/rerank/interface.py +287 -0
  402. nexaai/mlx_backend/rerank/main.py +127 -0
  403. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  404. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  405. nexaai/mlx_backend/sd/__init__.py +1 -0
  406. nexaai/mlx_backend/sd/interface.py +362 -0
  407. nexaai/mlx_backend/sd/main.py +286 -0
  408. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  409. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  410. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  411. nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
  412. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  413. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  414. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  415. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  416. nexaai/mlx_backend/tts/__init__.py +12 -0
  417. nexaai/mlx_backend/tts/interface.py +276 -0
  418. nexaai/mlx_backend/vlm/__init__.py +3 -0
  419. nexaai/mlx_backend/vlm/generate.py +572 -0
  420. nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
  421. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
  422. nexaai/mlx_backend/vlm/interface.py +559 -0
  423. nexaai/mlx_backend/vlm/main.py +365 -0
  424. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  425. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  426. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  427. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  428. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  429. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  430. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  431. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  432. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  433. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  434. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  435. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  436. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  437. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  438. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  439. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  440. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  441. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  442. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  443. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  444. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  445. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  446. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  447. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  448. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  449. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  450. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  451. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  452. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  453. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  454. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  455. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  456. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  457. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  458. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  459. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  460. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  461. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  462. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  463. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  464. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  465. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  466. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  467. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  468. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  470. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  471. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  472. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  473. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  474. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  475. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  476. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  477. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  478. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  479. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  480. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  481. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  482. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  483. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  484. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  485. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  486. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  487. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  488. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  489. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  490. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  491. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  492. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  493. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  494. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  495. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  496. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  497. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  498. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  499. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  500. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  501. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  502. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  503. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  504. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  505. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  506. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  507. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  508. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  509. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  510. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  511. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  512. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  513. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  514. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  515. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  516. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  517. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  518. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  519. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  520. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  521. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  522. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  523. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  524. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  525. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  526. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
  527. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
  528. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
  529. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
  530. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
  531. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
  532. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
  533. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
  534. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
  535. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  536. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  537. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  538. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  539. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  540. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  541. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  542. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  543. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
  544. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  545. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  546. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  547. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  548. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  549. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  550. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  551. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  552. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  553. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  554. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  555. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  556. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  557. nexaai/rerank.py +57 -0
  558. nexaai/rerank_impl/__init__.py +0 -0
  559. nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
  560. nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
  561. nexaai/runtime.py +68 -0
  562. nexaai/runtime_error.py +24 -0
  563. nexaai/tts.py +75 -0
  564. nexaai/tts_impl/__init__.py +0 -0
  565. nexaai/tts_impl/mlx_tts_impl.py +94 -0
  566. nexaai/tts_impl/pybind_tts_impl.py +43 -0
  567. nexaai/utils/decode.py +18 -0
  568. nexaai/utils/manifest_utils.py +531 -0
  569. nexaai/utils/model_manager.py +1745 -0
  570. nexaai/utils/model_types.py +49 -0
  571. nexaai/utils/progress_tracker.py +389 -0
  572. nexaai/utils/quantization_utils.py +245 -0
  573. nexaai/vlm.py +130 -0
  574. nexaai/vlm_impl/__init__.py +0 -0
  575. nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
  576. nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
  577. nexaai-1.0.29.dist-info/METADATA +35 -0
  578. nexaai-1.0.29.dist-info/RECORD +580 -0
  579. nexaai-1.0.29.dist-info/WHEEL +5 -0
  580. nexaai-1.0.29.dist-info/top_level.txt +1 -0
nexaai/vlm.py ADDED
@@ -0,0 +1,130 @@
+ from typing import Generator, Optional, List, Dict, Any, Union
+ from abc import abstractmethod
+ import queue
+ import threading
+ import base64
+ from pathlib import Path
+
+ from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
+ from nexaai.base import BaseModel, ProfilingData
+
+
+ class VLM(BaseModel):
+     def __init__(self, m_cfg: ModelConfig = ModelConfig()):
+         """Initialize base VLM class."""
+         self._m_cfg = m_cfg
+         self._cancel_event = threading.Event()  # New attribute to control cancellation
+
+     @classmethod
+     def _load_from(cls,
+                    local_path: str,
+                    mmproj_path: str = None,
+                    model_name: Optional[str] = None,
+                    m_cfg: ModelConfig = ModelConfig(),
+                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
+                    device_id: Optional[str] = None,
+                    **kwargs
+                    ) -> 'VLM':
+         """Load VLM model from local path, routing to appropriate implementation.
+
+         Args:
+             local_path: Path to the main model file
+             mmproj_path: Path to the multimodal projection file
+             m_cfg: Model configuration
+             plugin_id: Plugin identifier
+             device_id: Optional device ID (not used in current binding)
+
+         Returns:
+             VLM instance
+         """
+         # Check plugin_id value for routing - handle both enum and string
+         plugin_value = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
+
+         if plugin_value == "mlx":
+             from nexaai.vlm_impl.mlx_vlm_impl import MlxVlmImpl
+             return MlxVlmImpl._load_from(local_path, mmproj_path, model_name, m_cfg, plugin_id, device_id)
+         else:
+             from nexaai.vlm_impl.pybind_vlm_impl import PyBindVLMImpl
+             return PyBindVLMImpl._load_from(local_path, mmproj_path, model_name, m_cfg, plugin_id, device_id)
+
+     @abstractmethod
+     def eject(self):
+         """Release the model from memory."""
+         pass
+
+     def cancel_generation(self):
+         """Signal to cancel any ongoing stream generation."""
+         self._cancel_event.set()
+
+     def reset_cancel(self):
+         """Reset the cancel event. Call before starting a new generation if needed."""
+         self._cancel_event.clear()
+
+     @abstractmethod
+     def reset(self):
+         """
+         Reset the VLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
+         If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
+         """
+         pass
+
+     def _process_image(self, image: Union[bytes, str, Path]) -> bytes:
+         """Process image input to bytes format.
+
+         Args:
+             image: Image data as bytes, base64 string, or file path
+
+         Returns:
+             Image data as bytes
+         """
+         if isinstance(image, bytes):
+             return image
+         elif isinstance(image, str):
+             # Check if it's a base64 string
+             if image.startswith('data:image'):
+                 # Extract base64 data from data URL
+                 base64_data = image.split(',')[1] if ',' in image else image
+                 return base64.b64decode(base64_data)
+             else:
+                 # Assume it's a file path
+                 with open(image, 'rb') as f:
+                     return f.read()
+         elif isinstance(image, Path):
+             with open(image, 'rb') as f:
+                 return f.read()
+         else:
+             raise ValueError(f"Unsupported image type: {type(image)}")
+
+
+     @abstractmethod
+     def apply_chat_template(
+         self,
+         messages: List[MultiModalMessage],
+         tools: Optional[List[Dict[str, Any]]] = None,
+         enable_thinking: bool = True
+     ) -> str:
+         """Apply the chat template to multimodal messages."""
+         pass
+
+     @abstractmethod
+     def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
+         """Generate text with streaming."""
+         pass
+
+     @abstractmethod
+     def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
+         """
+         Generate text without streaming.
+
+         Args:
+             prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
+             g_cfg (GenerationConfig): Generation configuration.
+
+         Returns:
+             str: The generated text.
+         """
+         pass
+
+     def get_profiling_data(self) -> Optional[ProfilingData]:
+         """Get profiling data from the last generation."""
+         pass
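As a quick orientation for the nexaai/vlm.py interface above, here is a minimal usage sketch. It is not shipped with the package: the model path, the "path" field on the image content block, and the direct use of the private _load_from classmethod are assumptions based only on the code shown in this diff.

from nexaai.common import ModelConfig, GenerationConfig
from nexaai.vlm import VLM

# Hypothetical model location; _load_from routes to MlxVlmImpl when plugin_id
# is "mlx" and to PyBindVLMImpl otherwise.
vlm = VLM._load_from(
    local_path="/path/to/model",   # placeholder
    mmproj_path=None,
    m_cfg=ModelConfig(),
    plugin_id="mlx",
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image", "path": "/path/to/image.png"},  # field name assumed
        ],
    },
]

prompt = vlm.apply_chat_template(messages)
vlm.reset()  # clear context/KV cache before a fresh conversation, per the docstring
for token in vlm.generate_stream(prompt, GenerationConfig()):
    print(token, end="", flush=True)
vlm.eject()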
File without changes
nexaai/vlm_impl/mlx_vlm_impl.py ADDED
@@ -0,0 +1,259 @@
+ from typing import Generator, Optional, List, Dict, Any, Union
+
+ from nexaai.base import ProfilingData
+ from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
+ from nexaai.vlm import VLM
+ from nexaai.mlx_backend.vlm.interface import VLM as MLXVLMInterface
+ from nexaai.mlx_backend.ml import ModelConfig as MLXModelConfig, SamplerConfig as MLXSamplerConfig, GenerationConfig as MLXGenerationConfig, EmbeddingConfig
+
+
+ class MlxVlmImpl(VLM):
+     def __init__(self, m_cfg: ModelConfig = ModelConfig()):
+         """Initialize MLX VLM implementation."""
+         super().__init__(m_cfg)
+         self._mlx_vlm = None
+
+     @classmethod
+     def _load_from(cls,
+                    local_path: str,
+                    mmproj_path: str = None,
+                    model_name: Optional[str] = None,
+                    m_cfg: ModelConfig = ModelConfig(),
+                    plugin_id: Union[PluginID, str] = PluginID.MLX,
+                    device_id: Optional[str] = None
+                    ) -> 'MlxVlmImpl':
+         """Load VLM model from local path using MLX backend.
+
+         Args:
+             local_path: Path to the main model file
+             mmproj_path: Path to the multimodal projection file (not used in MLX VLM)
+             m_cfg: Model configuration
+             plugin_id: Plugin identifier
+             device_id: Optional device ID
+
+         Returns:
+             MlxVlmImpl instance
+         """
+         try:
+             # MLX interface is already imported
+
+             # Create instance and load MLX VLM
+             instance = cls(m_cfg)
+             instance._mlx_vlm = MLXVLMInterface(
+                 model_name=model_name,
+                 model_path=local_path,
+                 mmproj_path=mmproj_path,  # MLX VLM may not use this, but pass it anyway
+                 context_length=m_cfg.n_ctx,
+                 device=device_id
+             )
+
+             return instance
+         except Exception as e:
+             raise RuntimeError(f"Failed to load MLX VLM: {str(e)}")
+
+     def eject(self):
+         """Release the model from memory."""
+         if self._mlx_vlm:
+             self._mlx_vlm.destroy()
+             self._mlx_vlm = None
+
+     def reset(self):
+         """
+         Reset the VLM model context and KV cache.
+         """
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             self._mlx_vlm.reset()
+         except Exception as e:
+             raise RuntimeError(f"Failed to reset MLX VLM: {str(e)}")
+
+     def apply_chat_template(
+         self,
+         messages: List[MultiModalMessage],
+         tools: Optional[List[Dict[str, Any]]] = None,
+         enable_thinking: bool = True
+     ) -> str:
+         """Apply the chat template to multimodal messages."""
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             mlx_messages = []
+             total_images = 0
+             total_audios = 0
+
+             for msg in messages:
+                 # Create a simple object with role and content attributes
+                 class MLXChatMessage:
+                     def __init__(self, role, content):
+                         self.role = role
+                         self.content = content
+
+                 # Extract text content and count media files
+                 text_content = ""
+                 first_content = True
+
+                 for content_item in msg["content"]:
+                     content_type = content_item.get("type", "")
+
+                     if content_type == "text":
+                         if not first_content:
+                             text_content += " "
+                         text_content += content_item.get("text", "")
+                         first_content = False
+                     elif content_type == "image":
+                         total_images += 1
+                     elif content_type == "audio":
+                         total_audios += 1
+
+                 mlx_messages.append(MLXChatMessage(msg["role"], text_content))
+
+             if total_images > 0 or total_audios > 0:
+                 # Use apply_chat_template_with_media when media is present
+                 return self._mlx_vlm.apply_chat_template_with_media(
+                     mlx_messages,
+                     num_images=total_images,
+                     num_audios=total_audios,
+                     tools=tools,
+                     enable_thinking=enable_thinking
+                 )
+             else:
+                 # Use regular apply_chat_template for text-only messages
+                 return self._mlx_vlm.apply_chat_template(mlx_messages)
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to apply chat template: {str(e)}")
+
+     def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
+         """Generate text with streaming."""
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             # Convert GenerationConfig to MLX format
+             mlx_gen_config = MLXGenerationConfig()
+             mlx_gen_config.max_tokens = g_cfg.max_tokens
+             mlx_gen_config.stop = g_cfg.stop_words
+             mlx_gen_config.image_paths = g_cfg.image_paths
+             mlx_gen_config.audio_paths = g_cfg.audio_paths
+
+             if g_cfg.sampler_config:
+                 mlx_sampler_config = MLXSamplerConfig()
+                 mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                 mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                 mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                 mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                 mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                 mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                 mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                 mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                 mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                 mlx_gen_config.sampler_config = mlx_sampler_config
+
+             import queue
+             import threading
+
+             # Create a queue for streaming tokens
+             token_queue = queue.Queue()
+             exception_container = [None]
+             self.reset_cancel()  # Reset cancel flag before generation
+
+             def token_callback(token: str, user_data: Any = None) -> bool:
+                 if self._cancel_event.is_set():
+                     token_queue.put(('end', None))
+                     return False
+                 try:
+                     token_queue.put(('token', token))
+                     return True
+                 except Exception as e:
+                     exception_container[0] = e
+                     return False
+
+             # Run generation in a separate thread
+             def generate():
+                 try:
+                     self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
+                 except Exception as e:
+                     exception_container[0] = e
+                 finally:
+                     token_queue.put(('end', None))
+
+             thread = threading.Thread(target=generate)
+             thread.start()
+
+             # Yield tokens as they come from the queue
+             while True:
+                 if exception_container[0]:
+                     raise exception_container[0]
+
+                 try:
+                     msg_type, token = token_queue.get(timeout=0.1)
+                     if msg_type == 'end':
+                         break
+                     elif msg_type == 'token':
+                         yield token
+                 except queue.Empty:
+                     if not thread.is_alive():
+                         break
+                     continue
+
+             thread.join()
+
+             if exception_container[0]:
+                 raise exception_container[0]
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to generate streaming text: {str(e)}")
+
+     def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
+         """
+         Generate text without streaming.
+
+         Args:
+             prompt (str): The prompt to generate text from.
+             g_cfg (GenerationConfig): Generation configuration.
+
+         Returns:
+             str: The generated text.
+         """
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             # Convert GenerationConfig to MLX format
+             mlx_gen_config = MLXGenerationConfig()
+             mlx_gen_config.max_tokens = g_cfg.max_tokens
+             mlx_gen_config.stop = g_cfg.stop_words
+             mlx_gen_config.image_paths = g_cfg.image_paths
+             mlx_gen_config.audio_paths = g_cfg.audio_paths
+
+             if g_cfg.sampler_config:
+                 mlx_sampler_config = MLXSamplerConfig()
+                 mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                 mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                 mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                 mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                 mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                 mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                 mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                 mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                 mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                 mlx_gen_config.sampler_config = mlx_sampler_config
+
+             # Simple token callback that just continues
+             def token_callback(token: str, user_data: Any = None) -> bool:
+                 return not self._cancel_event.is_set()
+
+             # Use MLX streaming generation and return the full result
+             return self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to generate text: {str(e)}")
+
+     def get_profiling_data(self) -> Optional[ProfilingData]:
+         """Get profiling data from the last generation."""
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+         return self._mlx_vlm.get_profiling_data()
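The MLX implementation above bridges a callback-based backend (MLXVLMInterface.generate_stream) to a Python generator by pushing tokens through a queue from a worker thread. The following standalone sketch shows the same pattern with only the standard library; fake_backend_generate is an illustrative stand-in, not part of nexaai.

import queue
import threading
from typing import Callable, Generator

def fake_backend_generate(prompt: str, on_token: Callable[[str], bool]) -> None:
    # Stand-in for a callback-driven backend; a False return asks it to stop early.
    for word in prompt.split():
        if not on_token(word + " "):
            return

def stream(prompt: str) -> Generator[str, None, None]:
    token_queue = queue.Queue()  # carries ("token", text) and ("end", None) items
    error = [None]               # surfaces worker exceptions to the consumer

    def on_token(token: str) -> bool:
        token_queue.put(("token", token))
        return True

    def worker() -> None:
        try:
            fake_backend_generate(prompt, on_token)
        except Exception as exc:
            error[0] = exc
        finally:
            token_queue.put(("end", None))  # always unblock the consumer

    thread = threading.Thread(target=worker)
    thread.start()
    while True:
        kind, token = token_queue.get()
        if kind == "end":
            break
        yield token
    thread.join()
    if error[0] is not None:
        raise error[0]

if __name__ == "__main__":
    print("".join(stream("hello from the worker thread")))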
nexaai/vlm_impl/pybind_vlm_impl.py ADDED
@@ -0,0 +1,275 @@
+ from typing import Generator, Optional, List, Dict, Any, Union
+ import queue
+ import threading
+ from pathlib import Path
+
+ from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
+ from nexaai.binds import vlm_bind, common_bind
+ from nexaai.runtime import _ensure_runtime
+ from nexaai.vlm import VLM
+ from nexaai.base import ProfilingData
+ from nexaai.runtime_error import ContextLengthExceededError, GenerationError
+
+ # Error codes from ml.h
+ ML_SUCCESS = 0
+ ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH = -200004
+
+
+ class PyBindVLMImpl(VLM):
+     def __init__(self, handle: any, m_cfg: ModelConfig = ModelConfig()):
+         """Private constructor, should not be called directly."""
+         super().__init__(m_cfg)
+         self._handle = handle  # This is a py::capsule
+         self._profiling_data = None
+
+     @classmethod
+     def _load_from(
+         cls,
+         local_path: str,
+         mmproj_path: str = None,
+         model_name: Optional[str] = None,
+         m_cfg: ModelConfig = ModelConfig(),
+         plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
+         device_id: Optional[str] = None,
+     ) -> "PyBindVLMImpl":
+         """Load VLM model from local path.
+
+         Args:
+             local_path: Path to the main model file
+             mmproj_path: Path to the multimodal projection file
+             m_cfg: Model configuration
+             plugin_id: Plugin identifier
+             device_id: Optional device ID (not used in current binding)
+
+         Returns:
+             PyBindVLMImpl instance
+         """
+         _ensure_runtime()
+
+         config = common_bind.ModelConfig()
+
+         config.n_ctx = m_cfg.n_ctx
+         if m_cfg.n_threads is not None:
+             config.n_threads = m_cfg.n_threads
+         if m_cfg.n_threads_batch is not None:
+             config.n_threads_batch = m_cfg.n_threads_batch
+         if m_cfg.n_batch is not None:
+             config.n_batch = m_cfg.n_batch
+         if m_cfg.n_ubatch is not None:
+             config.n_ubatch = m_cfg.n_ubatch
+         if m_cfg.n_seq_max is not None:
+             config.n_seq_max = m_cfg.n_seq_max
+         config.n_gpu_layers = m_cfg.n_gpu_layers
+
+         # handle chat template strings
+         if m_cfg.chat_template_path:
+             config.chat_template_path = m_cfg.chat_template_path
+
+         if m_cfg.chat_template_content:
+             config.chat_template_content = m_cfg.chat_template_content
+
+         # handle system prompt (required for NPU plugin)
+         if m_cfg.system_prompt:
+             config.system_prompt = m_cfg.system_prompt
+
+         # Create handle : returns py::capsule with automatic cleanup
+         # Convert enum to string for C++ binding
+         plugin_id_str = (
+             plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
+         )
+         handle = vlm_bind.create_vlm(
+             model_path=local_path,
+             mmproj_path=mmproj_path,
+             model_name=model_name,
+             model_config=config,
+             plugin_id=plugin_id_str,
+             device_id=device_id,
+         )
+         return cls(handle, m_cfg)
+
+     def eject(self):
+         """Release the model from memory."""
+         # py::capsule handles cleanup automatically
+         del self._handle
+         self._handle = None
+
+     def reset(self):
+         """
+         Reset the VLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
+         If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
+         """
+         vlm_bind.ml_vlm_reset(self._handle)
+
+     def apply_chat_template(
+         self,
+         messages: List[MultiModalMessage],
+         tools: Optional[List[Dict[str, Any]]] = None,
+         enable_thinking: bool = True,
+     ) -> str:
+         """Apply the chat template to multimodal messages."""
+         payload = []
+         for msg in messages:
+             role = msg["role"]
+             blocks = []
+
+             for c in msg["content"]:
+                 t = c["type"]
+                 if t == "text":
+                     blocks.append({"type": "text", "text": c.get("text", "") or ""})
+                 else:
+                     # Pass through the original structure for image, audio, and any other types
+                     # Let vlm-bind.cpp handle field extraction (text/url/path)
+                     blocks.append(c)
+
+             payload.append({"role": role, "content": blocks})
+
+         result = vlm_bind.ml_vlm_apply_chat_template(
+             self._handle, payload, tools, enable_thinking
+         )
+         return result
+
+     def generate_stream(
+         self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()
+     ) -> Generator[str, None, None]:
+         """Generate text with streaming."""
+         token_queue = queue.Queue()
+         exception_container = [None]
+         self.reset_cancel()  # Reset cancel flag before generation
+
+         def on_token(token: str, user_data) -> bool:
+             if self._cancel_event.is_set():
+                 token_queue.put(("end", None))
+                 return False  # Stop generation
+             try:
+                 token_queue.put(("token", token))
+                 return True  # Continue generation
+             except Exception as e:
+                 exception_container[0] = e
+                 return False  # Stop generation
+
+         config = self._convert_generation_config(g_cfg)
+
+         # Run generation in thread
+         def generate():
+             try:
+                 result = vlm_bind.ml_vlm_generate(
+                     handle=self._handle,
+                     prompt=prompt,
+                     config=config,
+                     on_token=on_token,
+                     user_data=None,
+                 )
+
+                 # Check for errors in result
+                 error_code = result.get("error_code", ML_SUCCESS)
+                 if error_code != ML_SUCCESS:
+                     error_message = result.get("error_message", "Unknown error")
+                     if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
+                         exception_container[0] = ContextLengthExceededError(
+                             error_message, error_code
+                         )
+                     else:
+                         exception_container[0] = GenerationError(
+                             error_message, error_code
+                         )
+                     token_queue.put(("end", None))
+                     return
+
+                 self._profiling_data = ProfilingData.from_dict(
+                     result.get("profile_data", {})
+                 )
+             except Exception as e:
+                 exception_container[0] = e
+             finally:
+                 token_queue.put(("end", None))
+
+         thread = threading.Thread(target=generate)
+         thread.start()
+
+         # Yield tokens as they come
+         try:
+             while True:
+                 msg_type, token = token_queue.get()
+                 if msg_type == "token":
+                     yield token
+                 elif msg_type in ("error", "end"):
+                     break
+         finally:
+             thread.join()
+
+         if exception_container[0]:
+             raise exception_container[0]
+
+     def generate(
+         self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()
+     ) -> str:
+         """
+         Generate text without streaming.
+
+         Args:
+             prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
+             g_cfg (GenerationConfig): Generation configuration.
+
+         Returns:
+             str: The generated text.
+         """
+         config = self._convert_generation_config(g_cfg)
+         result = vlm_bind.ml_vlm_generate(
+             handle=self._handle,
+             prompt=prompt,
+             config=config,
+             on_token=None,  # No callback for non-streaming
+             user_data=None,
+         )
+
+         # Check for errors in result
+         error_code = result.get("error_code", ML_SUCCESS)
+         if error_code != ML_SUCCESS:
+             error_message = result.get("error_message", "Unknown error")
+             if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
+                 raise ContextLengthExceededError(error_message, error_code)
+             else:
+                 raise GenerationError(error_message, error_code)
+
+         self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
+         return result.get("text", "")
+
+     def get_profiling_data(self) -> Optional[ProfilingData]:
+         """Get profiling data."""
+         return self._profiling_data
+
+     def _convert_generation_config(self, g_cfg: GenerationConfig):
+         """Convert GenerationConfig to binding format."""
+         config = common_bind.GenerationConfig()
+
+         # Set basic generation parameters
+         config.max_tokens = g_cfg.max_tokens
+
+         if g_cfg.stop_words:
+             config.stop = g_cfg.stop_words
+
+         if g_cfg.image_paths:
+             config.image_paths = g_cfg.image_paths
+
+         if g_cfg.audio_paths:
+             config.audio_paths = g_cfg.audio_paths
+
+         if g_cfg.sampler_config:
+             sampler = common_bind.SamplerConfig()
+             sampler.temperature = g_cfg.sampler_config.temperature
+             sampler.top_p = g_cfg.sampler_config.top_p
+             sampler.top_k = g_cfg.sampler_config.top_k
+             sampler.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+             sampler.presence_penalty = g_cfg.sampler_config.presence_penalty
+             sampler.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+             sampler.seed = g_cfg.sampler_config.seed
+
+             if g_cfg.sampler_config.grammar_path:
+                 sampler.grammar_path = g_cfg.sampler_config.grammar_path
+
+             if g_cfg.sampler_config.grammar_string:
+                 sampler.grammar_string = g_cfg.sampler_config.grammar_string
+
+             config.sampler_config = sampler
+
+         return config
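The PyBind implementation above maps non-zero error codes from ml_vlm_generate to ContextLengthExceededError (for ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH, -200004) or GenerationError. Below is a hedged sketch of handling those exceptions at the call site; it assumes an already loaded PyBindVLMImpl instance, and the reset-then-reraise policy is illustrative, not prescribed by the package.

from nexaai.common import GenerationConfig
from nexaai.runtime_error import ContextLengthExceededError, GenerationError

def generate_or_report(vlm, prompt: str) -> str:
    # vlm is assumed to be a loaded PyBindVLMImpl (or any VLM subclass) instance.
    try:
        return vlm.generate(prompt, GenerationConfig())
    except ContextLengthExceededError:
        # The prompt did not fit in the configured context window (n_ctx).
        vlm.reset()  # clear the KV cache before retrying with a shorter prompt
        raise
    except GenerationError as exc:
        # Any other non-zero error_code from the binding is wrapped here.
        raise RuntimeError(f"VLM generation failed: {exc}") from exc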