nexaai 1.0.4rc10__py3-none-macosx_11_0_arm64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nexaai might be problematic. Click here for more details.

Files changed (519):
  1. nexaai/__init__.py +71 -0
  2. nexaai/_version.py +4 -0
  3. nexaai/asr.py +60 -0
  4. nexaai/asr_impl/__init__.py +0 -0
  5. nexaai/asr_impl/mlx_asr_impl.py +91 -0
  6. nexaai/asr_impl/pybind_asr_impl.py +43 -0
  7. nexaai/base.py +39 -0
  8. nexaai/binds/__init__.py +3 -0
  9. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  10. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/libnexa_bridge.dylib +0 -0
  12. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  13. nexaai/binds/nexa_llama_cpp/libggml-base.dylib +0 -0
  14. nexaai/binds/nexa_llama_cpp/libggml-cpu.so +0 -0
  15. nexaai/binds/nexa_llama_cpp/libggml-metal.so +0 -0
  16. nexaai/binds/nexa_llama_cpp/libggml.dylib +0 -0
  17. nexaai/binds/nexa_llama_cpp/libllama.dylib +0 -0
  18. nexaai/binds/nexa_llama_cpp/libmtmd.dylib +0 -0
  19. nexaai/binds/nexa_llama_cpp/libnexa_plugin.dylib +0 -0
  20. nexaai/binds/nexa_mlx/libnexa_plugin.dylib +0 -0
  21. nexaai/binds/nexa_mlx/py-lib/ml.py +842 -0
  22. nexaai/binds/nexa_mlx/py-lib/mlx_audio/__init__.py +0 -0
  23. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/__init__.py +1 -0
  24. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  25. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  26. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  27. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  28. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  29. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  30. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  31. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  32. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  33. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  34. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  35. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  36. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  37. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  38. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  39. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  40. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  41. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  42. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  43. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  44. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  45. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  46. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  47. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  48. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  49. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  50. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  51. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  52. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  53. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  54. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  55. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  56. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  57. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  58. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  59. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  60. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  61. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  62. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  63. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  64. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  65. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  66. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  67. nexaai/binds/nexa_mlx/py-lib/mlx_audio/server.py +525 -0
  68. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/__init__.py +0 -0
  69. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  70. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  71. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/__init__.py +0 -0
  72. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/generate.py +174 -0
  73. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  74. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  75. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  76. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  77. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  78. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  79. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  80. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  81. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  82. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  83. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  84. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  85. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  86. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  87. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  88. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  89. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  90. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  91. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  92. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  93. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/utils.py +195 -0
  94. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/__init__.py +1 -0
  95. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/audio_player.py +120 -0
  96. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/convert.py +71 -0
  97. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/generate.py +449 -0
  98. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  99. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  100. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  101. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  102. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  103. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/base.py +84 -0
  104. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  105. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  106. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  107. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  108. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  109. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  110. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  111. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  112. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  113. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  114. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  115. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  116. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  117. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  118. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  119. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  120. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  121. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  122. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  123. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  124. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  125. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  126. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  127. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  128. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  129. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  130. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  131. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  132. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  133. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  134. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  135. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  136. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  137. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  138. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  139. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  140. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  141. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  142. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  143. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  144. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  145. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  146. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  147. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  148. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  149. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  150. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  151. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  152. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  153. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  154. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  155. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  156. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  157. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  158. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  159. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  160. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  161. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  162. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  163. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  164. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  165. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  166. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  167. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  168. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  169. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/utils.py +337 -0
  170. nexaai/binds/nexa_mlx/py-lib/mlx_audio/utils.py +237 -0
  171. nexaai/binds/nexa_mlx/py-lib/mlx_audio/version.py +1 -0
  172. nexaai/binds/nexa_mlx/py-lib/profiling.py +239 -0
  173. nexaai/common.py +61 -0
  174. nexaai/cv.py +87 -0
  175. nexaai/cv_impl/__init__.py +0 -0
  176. nexaai/cv_impl/mlx_cv_impl.py +88 -0
  177. nexaai/cv_impl/pybind_cv_impl.py +31 -0
  178. nexaai/embedder.py +68 -0
  179. nexaai/embedder_impl/__init__.py +0 -0
  180. nexaai/embedder_impl/mlx_embedder_impl.py +114 -0
  181. nexaai/embedder_impl/pybind_embedder_impl.py +91 -0
  182. nexaai/image_gen.py +136 -0
  183. nexaai/image_gen_impl/__init__.py +0 -0
  184. nexaai/image_gen_impl/mlx_image_gen_impl.py +291 -0
  185. nexaai/image_gen_impl/pybind_image_gen_impl.py +84 -0
  186. nexaai/llm.py +89 -0
  187. nexaai/llm_impl/__init__.py +0 -0
  188. nexaai/llm_impl/mlx_llm_impl.py +249 -0
  189. nexaai/llm_impl/pybind_llm_impl.py +207 -0
  190. nexaai/mlx_backend/asr/__init__.py +12 -0
  191. nexaai/mlx_backend/asr/interface.py +122 -0
  192. nexaai/mlx_backend/common/__init__.py +0 -0
  193. nexaai/mlx_backend/common/utils.py +25 -0
  194. nexaai/mlx_backend/cv/__init__.py +0 -0
  195. nexaai/mlx_backend/cv/generate.py +195 -0
  196. nexaai/mlx_backend/cv/interface.py +151 -0
  197. nexaai/mlx_backend/cv/main.py +81 -0
  198. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  199. nexaai/mlx_backend/embedding/__init__.py +0 -0
  200. nexaai/mlx_backend/embedding/generate.py +130 -0
  201. nexaai/mlx_backend/embedding/interface.py +312 -0
  202. nexaai/mlx_backend/embedding/main.py +82 -0
  203. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  204. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  205. nexaai/mlx_backend/llm/__init__.py +0 -0
  206. nexaai/mlx_backend/llm/generate.py +149 -0
  207. nexaai/mlx_backend/llm/interface.py +764 -0
  208. nexaai/mlx_backend/llm/main.py +68 -0
  209. nexaai/mlx_backend/ml.py +842 -0
  210. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  211. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  212. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  213. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  214. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  215. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  216. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  217. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  218. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  219. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  220. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  221. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  222. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  223. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  224. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  225. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  226. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  227. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  228. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  229. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  230. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  231. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  232. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  233. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  234. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  235. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  236. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  237. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  238. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  239. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  240. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  241. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  242. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  243. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  244. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  245. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  246. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  247. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  249. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  250. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  251. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  252. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  253. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  254. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  255. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  256. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  257. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  258. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  259. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  260. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  261. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  262. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  264. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  265. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  266. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  267. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  268. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  269. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  270. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  271. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  272. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  273. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  274. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  275. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  276. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  277. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  278. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  279. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  280. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  281. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  282. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  283. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  284. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  285. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  286. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  287. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  288. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  289. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  290. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  291. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  292. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  293. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  294. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  295. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  296. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  297. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  298. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  299. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  300. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  301. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  302. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  303. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  304. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  305. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  306. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  307. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  308. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  309. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  310. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  311. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  312. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  313. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  314. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  315. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  316. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  317. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  318. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  319. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  320. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  321. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  322. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  353. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  354. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  355. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  356. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  357. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  358. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  359. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  360. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  361. nexaai/mlx_backend/profiling.py +239 -0
  362. nexaai/mlx_backend/rerank/__init__.py +0 -0
  363. nexaai/mlx_backend/rerank/generate.py +174 -0
  364. nexaai/mlx_backend/rerank/interface.py +287 -0
  365. nexaai/mlx_backend/rerank/main.py +127 -0
  366. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  367. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  368. nexaai/mlx_backend/sd/__init__.py +1 -0
  369. nexaai/mlx_backend/sd/interface.py +362 -0
  370. nexaai/mlx_backend/sd/main.py +286 -0
  371. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  372. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  373. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  374. nexaai/mlx_backend/sd/modeling/model_io.py +330 -0
  375. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  376. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  377. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  378. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  379. nexaai/mlx_backend/tts/__init__.py +12 -0
  380. nexaai/mlx_backend/tts/interface.py +276 -0
  381. nexaai/mlx_backend/vlm/__init__.py +3 -0
  382. nexaai/mlx_backend/vlm/generate.py +572 -0
  383. nexaai/mlx_backend/vlm/interface.py +406 -0
  384. nexaai/mlx_backend/vlm/main.py +157 -0
  385. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  386. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  387. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  388. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  389. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  390. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  391. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  392. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  393. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  394. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  395. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  396. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  397. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  398. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  399. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  400. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  401. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  402. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  403. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  404. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  405. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  406. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  407. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  408. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  409. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  410. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  411. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  412. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  413. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  414. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  415. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  416. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  417. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  418. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  419. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  420. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  421. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  422. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  423. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  424. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  425. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  426. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  427. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  428. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  429. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  430. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  431. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  432. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  433. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  434. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  435. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  436. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  437. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  438. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  439. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  440. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  441. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  442. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  443. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  444. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  445. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  446. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  447. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  448. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  449. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  450. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  451. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  452. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  453. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  454. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  455. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  456. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  457. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  458. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  459. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  460. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  461. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  462. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  463. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  464. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  465. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  466. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  467. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  468. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  470. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  471. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  472. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  473. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  474. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  475. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  476. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  477. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  478. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  479. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  480. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  481. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  482. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  483. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  484. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  485. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  486. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  487. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  488. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  489. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  490. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  491. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  492. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  493. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  494. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  495. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  496. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  497. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  498. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  499. nexaai/rerank.py +51 -0
  500. nexaai/rerank_impl/__init__.py +0 -0
  501. nexaai/rerank_impl/mlx_rerank_impl.py +91 -0
  502. nexaai/rerank_impl/pybind_rerank_impl.py +42 -0
  503. nexaai/runtime.py +64 -0
  504. nexaai/tts.py +70 -0
  505. nexaai/tts_impl/__init__.py +0 -0
  506. nexaai/tts_impl/mlx_tts_impl.py +93 -0
  507. nexaai/tts_impl/pybind_tts_impl.py +42 -0
  508. nexaai/utils/avatar_fetcher.py +104 -0
  509. nexaai/utils/decode.py +18 -0
  510. nexaai/utils/model_manager.py +1195 -0
  511. nexaai/utils/progress_tracker.py +372 -0
  512. nexaai/vlm.py +120 -0
  513. nexaai/vlm_impl/__init__.py +0 -0
  514. nexaai/vlm_impl/mlx_vlm_impl.py +205 -0
  515. nexaai/vlm_impl/pybind_vlm_impl.py +228 -0
  516. nexaai-1.0.4rc10.dist-info/METADATA +26 -0
  517. nexaai-1.0.4rc10.dist-info/RECORD +519 -0
  518. nexaai-1.0.4rc10.dist-info/WHEEL +5 -0
  519. nexaai-1.0.4rc10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,249 @@
1
+ from typing import Generator, Optional, Any
2
+
3
+ from nexaai.common import ModelConfig, GenerationConfig, ChatMessage
4
+ from nexaai.llm import LLM
5
+ from nexaai.mlx_backend.llm.interface import LLM as MLXLLMInterface
6
+ from nexaai.mlx_backend.ml import ModelConfig as MLXModelConfig, SamplerConfig as MLXSamplerConfig, GenerationConfig as MLXGenerationConfig, EmbeddingConfig
7
+
8
+
9
class MLXLLMImpl(LLM):
    """LLM implementation that delegates to the MLX backend interface.

    The wrapped backend object is stored in ``self._mlx_llm``; it is ``None``
    until ``_load_from`` succeeds and again after ``eject``.
    """

    def __init__(self, m_cfg: ModelConfig = ModelConfig()):
        """Initialize MLX LLM implementation (no model is loaded yet)."""
        super().__init__(m_cfg)
        self._mlx_llm = None

    @classmethod
    def _load_from(cls,
                   local_path: str,
                   tokenizer_path: Optional[str] = None,
                   m_cfg: ModelConfig = ModelConfig(),
                   plugin_id: str = "mlx",
                   device_id: Optional[str] = None
                   ) -> 'MLXLLMImpl':
        """Load model from local path using MLX backend.

        Args:
            local_path: Path of the model to load.
            tokenizer_path: Tokenizer location; falls back to ``local_path``
                when empty or ``None``.
            m_cfg: Model configuration copied field by field into the MLX
                model config.
            plugin_id: Accepted for signature compatibility; not read here.
            device_id: Forwarded to the backend as ``device``.

        Returns:
            A new instance wrapping the loaded backend model.

        Raises:
            RuntimeError: If config conversion or backend construction fails.
        """
        try:
            # Convert our ModelConfig to MLX ModelConfig
            mlx_config = MLXModelConfig()
            mlx_config.n_ctx = m_cfg.n_ctx
            mlx_config.n_threads = m_cfg.n_threads
            mlx_config.n_threads_batch = m_cfg.n_threads_batch
            mlx_config.n_batch = m_cfg.n_batch
            mlx_config.n_ubatch = m_cfg.n_ubatch
            mlx_config.n_seq_max = m_cfg.n_seq_max
            mlx_config.chat_template_path = m_cfg.chat_template_path
            mlx_config.chat_template_content = m_cfg.chat_template_content

            # Create instance and load MLX model
            instance = cls(m_cfg)
            instance._mlx_llm = MLXLLMInterface(
                model_path=local_path,
                tokenizer_path=tokenizer_path or local_path,
                config=mlx_config,
                device=device_id
            )

            return instance
        except Exception as e:
            raise RuntimeError(f"Failed to load MLX LLM: {str(e)}") from e

    def eject(self):
        """Release the model from memory (no-op when nothing is loaded)."""
        if self._mlx_llm:
            self._mlx_llm.destroy()
            self._mlx_llm = None

    def apply_chat_template(self, messages: list[ChatMessage]) -> str:
        """Apply the chat template to messages.

        Args:
            messages: Mappings with ``"role"`` and ``"content"`` keys.

        Returns:
            Whatever the backend's ``apply_chat_template`` returns.

        Raises:
            RuntimeError: If no model is loaded or the backend call fails.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        # The backend is handed objects exposing ``role`` / ``content``
        # attributes. Defined once per call rather than once per message.
        class MLXChatMessage:
            def __init__(self, role, content):
                self.role = role
                self.content = content

        try:
            mlx_messages = [
                MLXChatMessage(msg["role"], msg["content"]) for msg in messages
            ]
            return self._mlx_llm.apply_chat_template(mlx_messages)
        except Exception as e:
            raise RuntimeError(f"Failed to apply chat template: {str(e)}") from e

    def _convert_generation_config(self, g_cfg: GenerationConfig):
        """Copy a GenerationConfig (and its sampler config, if any) to MLX format."""
        mlx_gen_config = MLXGenerationConfig()
        mlx_gen_config.max_tokens = g_cfg.max_tokens
        mlx_gen_config.stop = g_cfg.stop_words
        mlx_gen_config.image_paths = g_cfg.image_paths
        mlx_gen_config.audio_paths = g_cfg.audio_paths

        if g_cfg.sampler_config:
            src = g_cfg.sampler_config
            mlx_sampler_config = MLXSamplerConfig()
            mlx_sampler_config.temperature = src.temperature
            mlx_sampler_config.top_p = src.top_p
            mlx_sampler_config.top_k = src.top_k
            mlx_sampler_config.repetition_penalty = src.repetition_penalty
            mlx_sampler_config.presence_penalty = src.presence_penalty
            mlx_sampler_config.frequency_penalty = src.frequency_penalty
            mlx_sampler_config.seed = src.seed
            mlx_sampler_config.grammar_path = src.grammar_path
            mlx_sampler_config.grammar_string = src.grammar_string
            mlx_gen_config.sampler_config = mlx_sampler_config

        return mlx_gen_config

    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
        """Generate text with streaming.

        The backend runs on a worker thread and pushes tokens through a queue;
        this generator yields them in arrival order.

        Args:
            prompt: The prompt to generate text from.
            g_cfg: Generation configuration.

        Yields:
            Each token passed to the backend callback.

        Raises:
            RuntimeError: If no model is loaded or generation fails. A backend
                error is raised after the tokens queued before it are yielded.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            import queue
            import threading

            mlx_gen_config = self._convert_generation_config(g_cfg)

            # Create a queue for streaming tokens
            token_queue = queue.Queue()
            exception_container = [None]
            self.reset_cancel()  # Reset cancel flag before generation

            def token_callback(token: str, user_data: Any = None) -> bool:
                if self._cancel_event.is_set():
                    token_queue.put(('end', None))
                    return False
                try:
                    token_queue.put(('token', token))
                    return True
                except Exception as e:
                    exception_container[0] = e
                    return False

            # Run generation in a separate thread
            def run_generation():
                try:
                    self._mlx_llm.generate_stream(prompt, mlx_gen_config, token_callback)
                except Exception as e:
                    exception_container[0] = e
                finally:
                    # Always queued, so the blocking get() below cannot hang.
                    token_queue.put(('end', None))

            thread = threading.Thread(target=run_generation)
            thread.start()

            completed = False
            try:
                # Blocking get: polling with a timeout plus an is_alive()
                # check could drop tokens queued just before the worker exited.
                while True:
                    msg_type, token = token_queue.get()
                    if msg_type != 'token':
                        break
                    yield token
                completed = True
            finally:
                if not completed:
                    # The consumer closed the generator (or raised into it)
                    # mid-stream: ask the worker to stop instead of letting it
                    # run unattended.
                    # NOTE(review): assumes _cancel_event is a threading.Event
                    # owned by the base class - confirm.
                    self._cancel_event.set()
                thread.join()

            if exception_container[0]:
                raise exception_container[0]

        except Exception as e:
            raise RuntimeError(f"Failed to generate streaming text: {str(e)}") from e

    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
        """
        Generate text without streaming.

        Args:
            prompt (str): The prompt to generate text from.
            g_cfg (GenerationConfig): Generation configuration.

        Returns:
            str: The generated text.

        Raises:
            RuntimeError: If no model is loaded or generation fails.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            mlx_gen_config = self._convert_generation_config(g_cfg)

            # Same as generate_stream: a cancel flag left over from an earlier
            # call must not abort this one on its first token.
            self.reset_cancel()

            # Callback only decides whether to continue; tokens are not collected here
            def token_callback(token: str, user_data: Any = None) -> bool:
                return not self._cancel_event.is_set()

            # Use MLX streaming generation and return the full result
            return self._mlx_llm.generate_stream(prompt, mlx_gen_config, token_callback)

        except Exception as e:
            raise RuntimeError(f"Failed to generate text: {str(e)}") from e

    def save_kv_cache(self, path: str):
        """
        Save the key-value cache to the file.

        Args:
            path (str): The path to the file.

        Raises:
            RuntimeError: If no model is loaded, the backend raises, or the
                backend reports failure.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            success = self._mlx_llm.save_kv_cache(path)
        except Exception as e:
            raise RuntimeError(f"Failed to save KV cache: {str(e)}") from e
        # Checked outside the try so this error is not re-wrapped into
        # "Failed to save KV cache: Failed to save KV cache".
        if not success:
            raise RuntimeError("Failed to save KV cache")

    def load_kv_cache(self, path: str):
        """
        Load the key-value cache from the file.

        Args:
            path (str): The path to the file.

        Raises:
            RuntimeError: If no model is loaded, the backend raises, or the
                backend reports failure.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            success = self._mlx_llm.load_kv_cache(path)
        except Exception as e:
            raise RuntimeError(f"Failed to load KV cache: {str(e)}") from e
        # Checked outside the try so this error is not re-wrapped.
        if not success:
            raise RuntimeError("Failed to load KV cache")

    def reset(self):
        """
        Reset the LLM model context and KV cache.

        Raises:
            RuntimeError: If no model is loaded or the backend reset fails.
        """
        if not self._mlx_llm:
            raise RuntimeError("MLX LLM not loaded")

        try:
            self._mlx_llm.reset()
        except Exception as e:
            raise RuntimeError(f"Failed to reset MLX LLM: {str(e)}") from e
@@ -0,0 +1,207 @@
1
+ from typing import Generator, Optional
2
+ import queue
3
+ import threading
4
+
5
+ from nexaai.common import ModelConfig, GenerationConfig, ChatMessage
6
+ from nexaai.binds import llm_bind, common_bind
7
+ from nexaai.runtime import _ensure_runtime
8
+ from nexaai.llm import LLM
9
+
10
+
11
class PyBindLLMImpl(LLM):
    """LLM implementation that calls the native ``llm_bind`` bindings.

    The model is represented by an opaque handle (a py::capsule) that is
    passed to every binding call.
    """

    def __init__(self, handle: object, m_cfg: ModelConfig = ModelConfig()):
        """Private constructor, should not be called directly."""
        super().__init__(m_cfg)
        self._handle = handle  # This is a py::capsule

    @classmethod
    def _load_from(cls,
                   local_path: str,
                   tokenizer_path: Optional[str] = None,
                   m_cfg: ModelConfig = ModelConfig(),
                   plugin_id: str = "llama_cpp",
                   device_id: Optional[str] = None
                   ) -> 'PyBindLLMImpl':
        """Load model from local path.

        Args:
            local_path: Path of the model to load.
            tokenizer_path: Forwarded unchanged to the binding (may be None).
            m_cfg: Model configuration; optional fields are copied only when
                set, so the binding's own defaults stay in effect otherwise.
            plugin_id: Plugin identifier forwarded to the binding.
            device_id: Device identifier forwarded to the binding.

        Returns:
            A new instance wrapping the created handle.
        """
        _ensure_runtime()

        config = common_bind.ModelConfig()

        config.n_ctx = m_cfg.n_ctx
        if m_cfg.n_threads is not None:
            config.n_threads = m_cfg.n_threads
        if m_cfg.n_threads_batch is not None:
            config.n_threads_batch = m_cfg.n_threads_batch
        if m_cfg.n_batch is not None:
            config.n_batch = m_cfg.n_batch
        if m_cfg.n_ubatch is not None:
            config.n_ubatch = m_cfg.n_ubatch
        if m_cfg.n_seq_max is not None:
            config.n_seq_max = m_cfg.n_seq_max
        if m_cfg.n_gpu_layers is not None:
            config.n_gpu_layers = m_cfg.n_gpu_layers

        # handle chat template strings
        if m_cfg.chat_template_path:
            config.chat_template_path = m_cfg.chat_template_path

        if m_cfg.chat_template_content:
            config.chat_template_content = m_cfg.chat_template_content

        # Create handle : returns py::capsule with automatic cleanup
        handle = llm_bind.ml_llm_create(
            model_path=local_path,
            tokenizer_path=tokenizer_path,
            model_config=config,
            plugin_id=plugin_id,
            device_id=device_id
        )
        return cls(handle, m_cfg)

    def eject(self):
        """Release the model from memory."""
        # py::capsule handles cleanup automatically once the reference is
        # dropped; rebinding to None is enough (no separate ``del`` needed).
        self._handle = None

    def apply_chat_template(self, messages: list[ChatMessage]) -> str:
        """Apply the chat template to messages."""
        # Convert TypedDict to list of dicts for binding
        message_dicts = [
            {"role": m["role"], "content": m["content"]}
            for m in messages
        ]
        return llm_bind.ml_llm_apply_chat_template(self._handle, message_dicts)

    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
        """Generate text with streaming.

        The binding runs on a worker thread and pushes tokens through a queue;
        this generator yields them in arrival order.

        Args:
            prompt: The prompt to generate text from.
            g_cfg: Generation configuration.

        Yields:
            Each token passed to the ``on_token`` callback.

        Raises:
            Exception: Whatever the binding raised on the worker thread,
                re-raised after the stream has ended.
        """
        token_queue = queue.Queue()
        exception_container = [None]
        self.reset_cancel()  # Reset cancel flag before generation

        def on_token(token: str, user_data) -> bool:
            if self._cancel_event.is_set():
                token_queue.put(('end', None))
                return False  # Stop generation
            try:
                token_queue.put(('token', token))
                return True  # Continue generation
            except Exception as e:
                exception_container[0] = e
                return False  # Stop generation

        config = self._convert_generation_config(g_cfg)

        # Run generation in thread
        def run_generation():
            try:
                llm_bind.ml_llm_generate(
                    handle=self._handle,
                    prompt=prompt,
                    config=config,
                    on_token=on_token,
                    user_data=None
                )
            except Exception as e:
                exception_container[0] = e
            finally:
                # Always queued, so the blocking get() below cannot hang.
                token_queue.put(('end', None))

        thread = threading.Thread(target=run_generation)
        thread.start()

        # Yield tokens as they come
        completed = False
        try:
            while True:
                msg_type, token = token_queue.get()
                if msg_type == 'token':
                    yield token
                elif msg_type in ('error', 'end'):
                    break
            completed = True
        finally:
            if not completed:
                # The consumer closed the generator (or raised into it)
                # mid-stream. Without a cancel request the join below would
                # block until the whole generation had run to completion.
                # NOTE(review): assumes _cancel_event is a threading.Event
                # owned by the base class - confirm.
                self._cancel_event.set()
            thread.join()

        if exception_container[0]:
            raise exception_container[0]

    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
        """
        Generate text without streaming.

        Args:
            prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
            g_cfg (GenerationConfig): Generation configuration.

        Returns:
            str: The generated text (empty string when the result has no "text" entry).
        """
        config = self._convert_generation_config(g_cfg)
        result = llm_bind.ml_llm_generate(
            handle=self._handle,
            prompt=prompt,
            config=config,
            on_token=None,  # No callback for non-streaming
            user_data=None
        )
        return result.get("text", "")

    def save_kv_cache(self, path: str):
        """
        Save the key-value cache to the file.

        Args:
            path (str): The path to the file.
        """
        llm_bind.ml_llm_save_kv_cache(self._handle, path)

    def load_kv_cache(self, path: str):
        """
        Load the key-value cache from the file.

        Args:
            path (str): The path to the file.
        """
        llm_bind.ml_llm_load_kv_cache(self._handle, path)

    def reset(self):
        """
        Reset the LLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
        If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
        """
        llm_bind.ml_llm_reset(self._handle)

    def _convert_generation_config(self, g_cfg: GenerationConfig):
        """Convert GenerationConfig to binding format.

        Optional list/string fields are copied only when non-empty, leaving
        the binding object's defaults in place otherwise.
        """
        config = common_bind.GenerationConfig()

        # Set basic generation parameters
        config.max_tokens = g_cfg.max_tokens

        if g_cfg.stop_words:
            config.stop = g_cfg.stop_words

        if g_cfg.image_paths:
            config.image_paths = g_cfg.image_paths

        if g_cfg.audio_paths:
            config.audio_paths = g_cfg.audio_paths

        if g_cfg.sampler_config:
            sampler = common_bind.SamplerConfig()
            sampler.temperature = g_cfg.sampler_config.temperature
            sampler.top_p = g_cfg.sampler_config.top_p
            sampler.top_k = g_cfg.sampler_config.top_k
            sampler.repetition_penalty = g_cfg.sampler_config.repetition_penalty
            sampler.presence_penalty = g_cfg.sampler_config.presence_penalty
            sampler.frequency_penalty = g_cfg.sampler_config.frequency_penalty
            sampler.seed = g_cfg.sampler_config.seed

            if g_cfg.sampler_config.grammar_path:
                sampler.grammar_path = g_cfg.sampler_config.grammar_path

            if g_cfg.sampler_config.grammar_string:
                sampler.grammar_string = g_cfg.sampler_config.grammar_string

            config.sampler_config = sampler

        return config
@@ -0,0 +1,12 @@
1
# phonemizer creates its logger instance inside the package and keeps it there,
# so logger.setLevel(logging.ERROR) has no effect on it. BaseWordsMismatch._resume
# is replaced with a no-op instead.
try:
    from phonemizer.backend.espeak.words_mismatch import BaseWordsMismatch
except ImportError:
    # phonemizer is not installed: nothing to patch.
    pass
else:
    def silent_resume(self, nmismatch, nlines):
        """Do nothing, so the original _resume warnings are suppressed."""

    BaseWordsMismatch._resume = silent_resume
@@ -0,0 +1,122 @@
1
+ from typing import Any, List, Optional, Sequence
2
+ import argparse
3
+ import sys
4
+ import os
5
+
6
+ import mlx.core as mx
7
+ import numpy as np
8
+
9
+ from ml import ASR, ASRConfig, ASRResult, Path as MLPath
10
+ from mlx_audio.stt.utils import load_model
11
+ from mlx_audio.stt.models.whisper.tokenizer import LANGUAGES
12
+ from mlx_audio.stt.models.whisper.whisper import Model
13
+ import soundfile as sf
14
+ import scipy.signal
15
+
16
+ from profiling import ProfilingMixin, StopReason
17
+
18
+
19
class MlxAsr(ASR, ProfilingMixin):
    """ASR interface implemented on top of MLX Audio models."""

    def __init__(
        self,
        model_path: MLPath,
        tokenizer_path: Optional[MLPath],
        language: Optional[str],
        device: Optional[str] = None,
    ) -> None:
        """Set up profiling, normalise the model path and load the model.

        Args:
            model_path: Model directory, or a file inside it (the containing
                directory is used in that case).
            tokenizer_path: Forwarded to the ``ASR`` base initializer.
            language: Forwarded to the ``ASR`` base initializer.
            device: Forwarded to the ``ASR`` base initializer.
        """
        ProfilingMixin.__init__(self)

        # A file path is reduced to its parent directory before loading.
        if os.path.isfile(model_path):
            model_path = os.path.dirname(model_path)

        super().__init__(model_path, tokenizer_path, language, device)

        # The model is loaded eagerly, as part of construction.
        self.model: Model = load_model(model_path)
        self.model_path = model_path

    def destroy(self) -> None:
        """Drop the model reference and clear the MLX cache (no-op if already destroyed)."""
        if self.model is None:
            return
        del self.model
        self.model = None
        mx.clear_cache()

    def close(self) -> None:
        """Alias for ``destroy``."""
        self.destroy()

    def transcribe(
        self,
        audio_path: MLPath,
        language: Optional[str] = None,
        config: Optional[ASRConfig] = None,
        clear_cache: bool = True,
    ) -> ASRResult:
        """Transcribe an audio file and return text, confidences and timestamps.

        Args:
            audio_path: Path of the audio file handed to the model.
            language: Accepted by the signature; not read by this method.
            config: Accepted by the signature; not read by this method.
            clear_cache: When true, clear the MLX cache after generation.

        Returns:
            An ``ASRResult`` with the transcript, one confidence score and one
            ``(start, end)`` pair per segment/sentence (or a single default
            entry when the result has neither), and the audio duration in
            microseconds.

        Raises:
            RuntimeError: If the model has been destroyed or generation fails.
        """
        if self.model is None:
            raise RuntimeError("Model not loaded")

        self._start_profiling()
        self._decode_start()

        try:
            result = self.model.generate(audio_path)

            if clear_cache:
                mx.clear_cache()

            self._decode_end()
            self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
            self._end_profiling()
        except Exception as e:
            self._end_profiling()
            raise RuntimeError(f"Failed to transcribe audio file {audio_path}: {e}")

        confidence_scores = []
        timestamps = []

        # Two result layouts are handled: one exposing ``segments`` (Whisper
        # STTOutput) and one exposing ``sentences`` (Parakeet AlignedResult).
        segments = getattr(result, 'segments', None)
        sentences = None if segments else getattr(result, 'sentences', None)

        if segments:
            for seg in segments:
                # exp(avg_logprob) clamped into [0, 1]; 0.5 when no log-prob is given.
                score = (
                    max(0.0, min(1.0, np.exp(seg['avg_logprob'])))
                    if 'avg_logprob' in seg
                    else 0.5
                )
                confidence_scores.append(score)
                timestamps.append((seg.get('start', 0.0), seg.get('end', 0.0)))
        elif sentences:
            for sent in sentences:
                # No confidence is read from sentences; a fixed default is used.
                confidence_scores.append(0.5)
                timestamps.append((sent.start, sent.end))
        else:
            # Neither layout present (or both empty): one default entry.
            confidence_scores.append(0.5)
            timestamps.append((0.0, 0.0))

        return ASRResult(
            transcript=result.text,
            confidence_scores=confidence_scores,
            timestamps=timestamps,
            duration_us=self._get_audio_duration_us(audio_path)
        )

    def list_supported_languages(self) -> List[str]:
        """Return the keys of the Whisper tokenizer's ``LANGUAGES`` table."""
        return [*LANGUAGES.keys()]

    def _get_audio_duration_us(self, audio_path: MLPath) -> int:
        """Return the audio duration in whole microseconds (frames / sample rate)."""
        with sf.SoundFile(audio_path) as audio_file:
            return int(audio_file.frames / audio_file.samplerate * 1e6)
File without changes
@@ -0,0 +1,25 @@
1
+ import atexit
2
+
3
# Store the original atexit.register function
_original_atexit_register = atexit.register


def _filtered_atexit_register(func, *args, **kwargs):
    """
    Clean atexit interceptor that skips nanobind handlers to prevent segfaults due to MLX atexit cleanups.
    This should be registered early during Python runtime initialization.

    Args:
        func: The exit handler being registered.
        *args: Positional arguments to pass to ``func`` at exit.
        **kwargs: Keyword arguments to pass to ``func`` at exit.

    Returns:
        ``func`` itself, whether or not it was actually registered, matching
        the return contract of ``atexit.register`` so decorator use still works.
    """
    # Skip nanobind handlers silently. They are recognised by the textual
    # form of their type, since no nanobind type is imported here to test against.
    func_type_str = str(type(func))
    if 'nanobind' in func_type_str or func_type_str.startswith("<class 'nb_"):
        # Hand back the caller's own callable rather than a placeholder.
        return func

    # Allow all other handlers to register normally
    return _original_atexit_register(func, *args, **kwargs)


def install_atexit_filter():
    """Install the atexit filter to prevent problematic nanobind registrations."""
    atexit.register = _filtered_atexit_register


def uninstall_atexit_filter():
    """Restore the original atexit.register function."""
    atexit.register = _original_atexit_register
File without changes