nexaai 1.0.4rc10__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nexaai might be problematic; see the registry's advisory page for more details.

Files changed (519)
  1. nexaai/__init__.py +71 -0
  2. nexaai/_version.py +4 -0
  3. nexaai/asr.py +60 -0
  4. nexaai/asr_impl/__init__.py +0 -0
  5. nexaai/asr_impl/mlx_asr_impl.py +91 -0
  6. nexaai/asr_impl/pybind_asr_impl.py +43 -0
  7. nexaai/base.py +39 -0
  8. nexaai/binds/__init__.py +3 -0
  9. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  10. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/libnexa_bridge.dylib +0 -0
  12. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  13. nexaai/binds/nexa_llama_cpp/libggml-base.dylib +0 -0
  14. nexaai/binds/nexa_llama_cpp/libggml-cpu.so +0 -0
  15. nexaai/binds/nexa_llama_cpp/libggml-metal.so +0 -0
  16. nexaai/binds/nexa_llama_cpp/libggml.dylib +0 -0
  17. nexaai/binds/nexa_llama_cpp/libllama.dylib +0 -0
  18. nexaai/binds/nexa_llama_cpp/libmtmd.dylib +0 -0
  19. nexaai/binds/nexa_llama_cpp/libnexa_plugin.dylib +0 -0
  20. nexaai/binds/nexa_mlx/libnexa_plugin.dylib +0 -0
  21. nexaai/binds/nexa_mlx/py-lib/ml.py +842 -0
  22. nexaai/binds/nexa_mlx/py-lib/mlx_audio/__init__.py +0 -0
  23. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/__init__.py +1 -0
  24. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  25. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  26. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  27. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  28. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  29. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  30. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  31. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  32. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  33. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  34. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  35. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  36. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  37. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  38. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  39. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  40. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  41. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  42. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  43. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  44. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  45. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  46. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  47. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  48. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  49. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  50. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  51. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  52. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  53. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  54. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  55. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  56. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  57. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  58. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  59. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  60. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  61. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  62. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  63. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  64. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  65. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  66. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  67. nexaai/binds/nexa_mlx/py-lib/mlx_audio/server.py +525 -0
  68. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/__init__.py +0 -0
  69. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  70. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  71. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/__init__.py +0 -0
  72. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/generate.py +174 -0
  73. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  74. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  75. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  76. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  77. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  78. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  79. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  80. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  81. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  82. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  83. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  84. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  85. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  86. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  87. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  88. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  89. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  90. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  91. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  92. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  93. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/utils.py +195 -0
  94. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/__init__.py +1 -0
  95. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/audio_player.py +120 -0
  96. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/convert.py +71 -0
  97. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/generate.py +449 -0
  98. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  99. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  100. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  101. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  102. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  103. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/base.py +84 -0
  104. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  105. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  106. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  107. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  108. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  109. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  110. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  111. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  112. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  113. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  114. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  115. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  116. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  117. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  118. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  119. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  120. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  121. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  122. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  123. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  124. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  125. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  126. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  127. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  128. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  129. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  130. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  131. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  132. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  133. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  134. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  135. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  136. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  137. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  138. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  139. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  140. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  141. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  142. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  143. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  144. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  145. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  146. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  147. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  148. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  149. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  150. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  151. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  152. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  153. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  154. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  155. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  156. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  157. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  158. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  159. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  160. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  161. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  162. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  163. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  164. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  165. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  166. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  167. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  168. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  169. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/utils.py +337 -0
  170. nexaai/binds/nexa_mlx/py-lib/mlx_audio/utils.py +237 -0
  171. nexaai/binds/nexa_mlx/py-lib/mlx_audio/version.py +1 -0
  172. nexaai/binds/nexa_mlx/py-lib/profiling.py +239 -0
  173. nexaai/common.py +61 -0
  174. nexaai/cv.py +87 -0
  175. nexaai/cv_impl/__init__.py +0 -0
  176. nexaai/cv_impl/mlx_cv_impl.py +88 -0
  177. nexaai/cv_impl/pybind_cv_impl.py +31 -0
  178. nexaai/embedder.py +68 -0
  179. nexaai/embedder_impl/__init__.py +0 -0
  180. nexaai/embedder_impl/mlx_embedder_impl.py +114 -0
  181. nexaai/embedder_impl/pybind_embedder_impl.py +91 -0
  182. nexaai/image_gen.py +136 -0
  183. nexaai/image_gen_impl/__init__.py +0 -0
  184. nexaai/image_gen_impl/mlx_image_gen_impl.py +291 -0
  185. nexaai/image_gen_impl/pybind_image_gen_impl.py +84 -0
  186. nexaai/llm.py +89 -0
  187. nexaai/llm_impl/__init__.py +0 -0
  188. nexaai/llm_impl/mlx_llm_impl.py +249 -0
  189. nexaai/llm_impl/pybind_llm_impl.py +207 -0
  190. nexaai/mlx_backend/asr/__init__.py +12 -0
  191. nexaai/mlx_backend/asr/interface.py +122 -0
  192. nexaai/mlx_backend/common/__init__.py +0 -0
  193. nexaai/mlx_backend/common/utils.py +25 -0
  194. nexaai/mlx_backend/cv/__init__.py +0 -0
  195. nexaai/mlx_backend/cv/generate.py +195 -0
  196. nexaai/mlx_backend/cv/interface.py +151 -0
  197. nexaai/mlx_backend/cv/main.py +81 -0
  198. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  199. nexaai/mlx_backend/embedding/__init__.py +0 -0
  200. nexaai/mlx_backend/embedding/generate.py +130 -0
  201. nexaai/mlx_backend/embedding/interface.py +312 -0
  202. nexaai/mlx_backend/embedding/main.py +82 -0
  203. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  204. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  205. nexaai/mlx_backend/llm/__init__.py +0 -0
  206. nexaai/mlx_backend/llm/generate.py +149 -0
  207. nexaai/mlx_backend/llm/interface.py +764 -0
  208. nexaai/mlx_backend/llm/main.py +68 -0
  209. nexaai/mlx_backend/ml.py +842 -0
  210. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  211. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  212. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  213. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  214. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  215. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  216. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  217. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  218. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  219. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  220. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  221. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  222. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  223. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  224. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  225. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  226. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  227. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  228. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  229. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  230. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  231. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  232. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  233. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  234. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  235. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  236. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  237. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  238. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  239. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  240. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  241. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  242. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  243. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  244. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  245. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  246. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  247. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  249. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  250. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  251. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  252. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  253. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  254. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  255. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  256. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  257. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  258. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  259. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  260. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  261. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  262. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  264. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  265. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  266. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  267. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  268. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  269. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  270. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  271. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  272. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  273. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  274. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  275. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  276. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  277. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  278. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  279. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  280. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  281. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  282. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  283. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  284. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  285. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  286. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  287. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  288. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  289. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  290. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  291. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  292. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  293. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  294. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  295. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  296. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  297. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  298. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  299. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  300. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  301. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  302. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  303. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  304. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  305. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  306. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  307. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  308. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  309. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  310. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  311. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  312. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  313. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  314. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  315. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  316. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  317. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  318. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  319. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  320. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  321. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  322. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  353. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  354. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  355. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  356. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  357. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  358. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  359. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  360. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  361. nexaai/mlx_backend/profiling.py +239 -0
  362. nexaai/mlx_backend/rerank/__init__.py +0 -0
  363. nexaai/mlx_backend/rerank/generate.py +174 -0
  364. nexaai/mlx_backend/rerank/interface.py +287 -0
  365. nexaai/mlx_backend/rerank/main.py +127 -0
  366. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  367. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  368. nexaai/mlx_backend/sd/__init__.py +1 -0
  369. nexaai/mlx_backend/sd/interface.py +362 -0
  370. nexaai/mlx_backend/sd/main.py +286 -0
  371. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  372. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  373. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  374. nexaai/mlx_backend/sd/modeling/model_io.py +330 -0
  375. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  376. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  377. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  378. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  379. nexaai/mlx_backend/tts/__init__.py +12 -0
  380. nexaai/mlx_backend/tts/interface.py +276 -0
  381. nexaai/mlx_backend/vlm/__init__.py +3 -0
  382. nexaai/mlx_backend/vlm/generate.py +572 -0
  383. nexaai/mlx_backend/vlm/interface.py +406 -0
  384. nexaai/mlx_backend/vlm/main.py +157 -0
  385. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  386. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  387. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  388. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  389. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  390. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  391. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  392. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  393. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  394. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  395. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  396. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  397. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  398. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  399. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  400. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  401. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  402. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  403. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  404. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  405. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  406. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  407. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  408. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  409. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  410. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  411. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  412. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  413. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  414. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  415. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  416. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  417. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  418. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  419. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  420. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  421. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  422. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  423. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  424. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  425. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  426. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  427. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  428. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  429. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  430. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  431. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  432. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  433. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  434. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  435. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  436. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  437. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  438. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  439. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  440. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  441. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  442. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  443. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  444. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  445. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  446. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  447. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  448. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  449. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  450. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  451. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  452. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  453. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  454. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  455. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  456. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  457. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  458. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  459. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  460. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  461. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  462. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  463. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  464. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  465. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  466. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  467. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  468. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  470. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  471. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  472. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  473. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  474. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  475. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  476. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  477. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  478. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  479. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  480. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  481. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  482. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  483. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  484. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  485. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  486. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  487. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  488. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  489. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  490. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  491. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  492. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  493. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  494. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  495. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  496. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  497. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  498. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  499. nexaai/rerank.py +51 -0
  500. nexaai/rerank_impl/__init__.py +0 -0
  501. nexaai/rerank_impl/mlx_rerank_impl.py +91 -0
  502. nexaai/rerank_impl/pybind_rerank_impl.py +42 -0
  503. nexaai/runtime.py +64 -0
  504. nexaai/tts.py +70 -0
  505. nexaai/tts_impl/__init__.py +0 -0
  506. nexaai/tts_impl/mlx_tts_impl.py +93 -0
  507. nexaai/tts_impl/pybind_tts_impl.py +42 -0
  508. nexaai/utils/avatar_fetcher.py +104 -0
  509. nexaai/utils/decode.py +18 -0
  510. nexaai/utils/model_manager.py +1195 -0
  511. nexaai/utils/progress_tracker.py +372 -0
  512. nexaai/vlm.py +120 -0
  513. nexaai/vlm_impl/__init__.py +0 -0
  514. nexaai/vlm_impl/mlx_vlm_impl.py +205 -0
  515. nexaai/vlm_impl/pybind_vlm_impl.py +228 -0
  516. nexaai-1.0.4rc10.dist-info/METADATA +26 -0
  517. nexaai-1.0.4rc10.dist-info/RECORD +519 -0
  518. nexaai-1.0.4rc10.dist-info/WHEEL +5 -0
  519. nexaai-1.0.4rc10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,156 @@
1
+ from unittest import mock
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ from mlx_audio.sts.voice_pipeline import VoicePipeline
7
+
8
+
9
class TestVoicePipeline:
    """Unit tests for ``VoicePipeline``: constructor wiring, model loading,
    energy-based silence detection, and VAD including its fallback path."""

    def test_initialization_default_params(self):
        """All constructor defaults are stored unchanged on the instance."""
        pipeline = VoicePipeline()
        assert pipeline.silence_threshold == 0.03
        assert pipeline.silence_duration == 1.5
        assert pipeline.input_sample_rate == 16_000
        assert pipeline.output_sample_rate == 24_000
        assert pipeline.streaming_interval == 3
        assert pipeline.frame_duration_ms == 30
        assert pipeline.stt_model == "mlx-community/whisper-large-v3-turbo"
        assert pipeline.llm_model == "Qwen/Qwen2.5-0.5B-Instruct-4bit"
        assert pipeline.tts_model == "mlx-community/csm-1b-fp16"

    def test_initialization_custom_params(self):
        """Explicit constructor arguments override every default."""
        pipeline = VoicePipeline(
            silence_threshold=0.05,
            silence_duration=2.0,
            input_sample_rate=8_000,
            output_sample_rate=12_000,
            streaming_interval=5,
            frame_duration_ms=20,
            vad_mode=2,
            stt_model="custom/stt",
            llm_model="custom/llm",
            tts_model="custom/tts",
        )
        assert pipeline.silence_threshold == 0.05
        assert pipeline.silence_duration == 2.0
        assert pipeline.input_sample_rate == 8_000
        assert pipeline.output_sample_rate == 12_000
        assert pipeline.streaming_interval == 5
        assert pipeline.frame_duration_ms == 20
        assert pipeline.stt_model == "custom/stt"
        assert pipeline.llm_model == "custom/llm"
        assert pipeline.tts_model == "custom/tts"

    # NOTE(review): this is a bare ``async def`` test; plain pytest skips it
    # with a warning unless an asyncio mode (pytest-asyncio / anyio) is
    # configured for the suite — confirm it actually runs in CI.
    @mock.patch("mlx_audio.sts.voice_pipeline.load_llm")
    @mock.patch("mlx_audio.sts.voice_pipeline.load_tts")
    @mock.patch("mlx_audio.sts.voice_pipeline.Whisper.from_pretrained")
    async def test_init_models(self, mock_whisper_load, mock_tts_load, mock_llm_load):
        """init_models loads the LLM, TTS and STT models and stores them.

        Decorators apply bottom-up, so the argument order is
        (whisper, tts, llm).
        """
        pipeline = VoicePipeline()

        # Mock the return values of the model loaders
        mock_llm = mock.AsyncMock()
        mock_tokenizer = mock.AsyncMock()
        mock_llm_load.return_value = (mock_llm, mock_tokenizer)

        mock_tts = mock.AsyncMock()
        mock_tts_load.return_value = mock_tts

        mock_stt = mock.AsyncMock()
        mock_whisper_load.return_value = mock_stt

        await pipeline.init_models()

        mock_llm_load.assert_called_once_with(pipeline.llm_model)
        mock_tts_load.assert_called_once_with(pipeline.tts_model)
        mock_whisper_load.assert_called_once_with(pipeline.stt_model)

        assert pipeline.llm is mock_llm
        assert pipeline.tokenizer is mock_tokenizer
        assert pipeline.tts is mock_tts
        assert pipeline.stt is mock_stt

    def test_is_silent_true(self):
        """_is_silent returns a truthy value for low-amplitude frames,
        whether given float32 samples or raw int16 bytes.

        Coerce through ``bool`` rather than comparing identity with
        ``np.True_`` so the test does not depend on NumPy's scalar type.
        """
        pipeline = VoicePipeline(silence_threshold=0.1)
        # Create a silent audio frame (very low amplitude), 30ms at 16kHz.
        silent_audio_data_np = np.random.uniform(-0.01, 0.01, size=480).astype(
            np.float32
        )
        silent_audio_data_bytes = (
            (silent_audio_data_np * 32768.0).astype(np.int16).tobytes()
        )

        assert bool(pipeline._is_silent(silent_audio_data_np)) is True
        assert bool(pipeline._is_silent(silent_audio_data_bytes)) is True

    def test_is_silent_false(self):
        """_is_silent returns a falsy value for high-amplitude frames."""
        pipeline = VoicePipeline(silence_threshold=0.001)
        # Create a non-silent audio frame (higher amplitude).
        speech_audio_data_np = np.random.uniform(-2, 2, size=480).astype(np.float32)
        speech_audio_data_bytes = (
            (speech_audio_data_np * 32768.0).astype(np.int16).tobytes()
        )

        assert bool(pipeline._is_silent(speech_audio_data_np)) is False
        assert bool(pipeline._is_silent(speech_audio_data_bytes)) is False

    @mock.patch("webrtcvad.Vad.is_speech")
    def test_voice_activity_detection_vad_speech(self, mock_is_speech):
        """VAD path: returns True when webrtcvad reports speech."""
        pipeline = VoicePipeline()
        mock_is_speech.return_value = True
        frame = b"\x00\x00" * (16000 * 30 // 1000)  # 30ms of silence at 16kHz, 16-bit
        assert pipeline._voice_activity_detection(frame) is True
        mock_is_speech.assert_called_once_with(frame, pipeline.input_sample_rate)

    @mock.patch("webrtcvad.Vad.is_speech")
    def test_voice_activity_detection_vad_silence(self, mock_is_speech):
        """VAD path: returns False when webrtcvad reports no speech."""
        pipeline = VoicePipeline()
        mock_is_speech.return_value = False
        frame = b"\x00\x00" * (16000 * 30 // 1000)
        assert pipeline._voice_activity_detection(frame) is False
        mock_is_speech.assert_called_once_with(frame, pipeline.input_sample_rate)

    @mock.patch("webrtcvad.Vad.is_speech")
    def test_voice_activity_detection_vad_error_fallback_silent(self, mock_is_speech):
        """When webrtcvad raises, the energy fallback classifies a quiet
        frame as silence (False)."""
        pipeline = VoicePipeline(silence_threshold=0.1)
        mock_is_speech.side_effect = ValueError("VAD error")

        frame_np = np.full(480, 0.001, dtype=np.float32)
        frame_bytes = (frame_np * 32768.0).astype(np.int16).tobytes()

        assert bool(pipeline._voice_activity_detection(frame_bytes)) is False
        mock_is_speech.assert_called_once_with(frame_bytes, pipeline.input_sample_rate)

    @mock.patch("webrtcvad.Vad.is_speech")
    def test_voice_activity_detection_vad_error_fallback_speech(self, mock_is_speech):
        """When webrtcvad raises, the energy fallback classifies a loud
        frame as speech (True)."""
        pipeline = VoicePipeline(silence_threshold=0.01)
        mock_is_speech.side_effect = ValueError("VAD error")
        frame_np = np.full(480, 0.5, dtype=np.float32)
        frame_bytes = (frame_np * 32768.0).astype(np.int16).tobytes()

        assert bool(pipeline._voice_activity_detection(frame_bytes)) is True
        mock_is_speech.assert_called_once_with(frame_bytes, pipeline.input_sample_rate)
@@ -0,0 +1,327 @@
1
+ import argparse
2
+ import asyncio
3
+ import logging
4
+
5
+ import mlx.core as mx
6
+ import numpy as np
7
+ import sounddevice as sd
8
+ import webrtcvad
9
+ from mlx_lm.generate import generate as generate_text
10
+ from mlx_lm.utils import load as load_llm
11
+
12
+ from mlx_audio.stt.models.whisper import Model as Whisper
13
+ from mlx_audio.tts.audio_player import AudioPlayer
14
+ from mlx_audio.tts.utils import load_model as load_tts
15
+
16
# Module-wide logging: timestamped, name-tagged records at INFO level.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
20
+
21
+
22
class VoicePipeline:
    """Full-duplex speech-to-speech loop: microphone -> VAD -> Whisper STT
    -> LLM reply -> streaming TTS -> speaker.

    Three asyncio tasks cooperate through queues: ``_listener`` (capture and
    end-of-utterance detection), ``_response_processor`` (transcript -> LLM
    -> TTS kick-off) and ``_audio_output_processor`` (playback).  All MLX
    model calls run on worker threads via ``asyncio.to_thread`` and are
    serialized by ``mlx_lock``.
    """

    def __init__(
        self,
        silence_threshold=0.03,
        silence_duration=1.5,
        input_sample_rate=16_000,
        output_sample_rate=24_000,
        streaming_interval=3,
        frame_duration_ms=30,
        vad_mode=3,
        stt_model="mlx-community/whisper-large-v3-turbo",
        llm_model="Qwen/Qwen2.5-0.5B-Instruct-4bit",
        tts_model="mlx-community/csm-1b-fp16",
    ):
        """Store configuration and create queues/locks.

        Args:
            silence_threshold: RMS energy below which a frame counts as silent
                (used only by the energy fallback in ``_voice_activity_detection``).
            silence_duration: seconds of continuous silence that end an utterance.
            input_sample_rate: microphone capture rate in Hz.
            output_sample_rate: TTS playback rate in Hz.
            streaming_interval: forwarded to the TTS ``generate`` call.
            frame_duration_ms: capture block size in milliseconds.
            vad_mode: webrtcvad aggressiveness (0-3; 3 is most aggressive).
            stt_model / llm_model / tts_model: model identifiers to load.
        """
        self.silence_threshold = silence_threshold
        self.silence_duration = silence_duration
        self.input_sample_rate = input_sample_rate
        self.output_sample_rate = output_sample_rate
        self.streaming_interval = streaming_interval
        self.frame_duration_ms = frame_duration_ms

        self.stt_model = stt_model
        self.llm_model = llm_model
        self.tts_model = tts_model

        self.vad = webrtcvad.Vad(vad_mode)

        # Bounded queues so a stalled consumer applies back-pressure instead
        # of growing without limit; transcription text is unbounded.
        self.input_audio_queue = asyncio.Queue(maxsize=50)
        self.transcription_queue = asyncio.Queue()
        self.output_audio_queue = asyncio.Queue(maxsize=50)

        self.mlx_lock = asyncio.Lock()

    async def init_models(self):
        """Load the LLM, TTS and STT models, storing them on the instance.

        LLM and TTS loading is pushed to worker threads; the Whisper load
        runs on the event loop thread (blocking it for the duration).
        """
        logger.info(f"Loading text generation model: {self.llm_model}")
        self.llm, self.tokenizer = await asyncio.to_thread(
            lambda: load_llm(self.llm_model)
        )

        logger.info(f"Loading text-to-speech model: {self.tts_model}")
        self.tts = await asyncio.to_thread(lambda: load_tts(self.tts_model))

        logger.info(f"Loading speech-to-text model: {self.stt_model}")
        self.stt = Whisper.from_pretrained(self.stt_model)

    async def start(self):
        """Load models, then run the three pipeline tasks until cancelled."""
        self.loop = asyncio.get_running_loop()

        await self.init_models()

        tasks = [
            asyncio.create_task(self._listener()),
            asyncio.create_task(self._response_processor()),
            asyncio.create_task(self._audio_output_processor()),
        ]
        try:
            await asyncio.gather(*tasks)
        finally:
            # If any task fails or we are cancelled, tear the others down too.
            for t in tasks:
                t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)

    # speech detection and transcription

    def _is_silent(self, audio_data):
        """Return True when the frame's RMS energy is below the threshold.

        Accepts either raw int16 PCM ``bytes`` or a NumPy sample array.
        Returns a NumPy bool scalar (result of ``<`` on a NumPy float).
        """
        if isinstance(audio_data, bytes):
            audio_np = np.frombuffer(audio_data, dtype=np.int16)
            audio_np = (
                audio_np.astype(np.float32) / 32768.0
            )  # Normalize if input is bytes
        else:
            audio_np = audio_data

        # Ensure audio_np is float32 for energy calculation.
        audio_np = audio_np.astype(np.float32)

        # RMS energy: L2 norm scaled by sqrt(sample count).
        energy = np.linalg.norm(audio_np) / np.sqrt(audio_np.size)
        return energy < self.silence_threshold

    def _voice_activity_detection(self, frame):
        """Classify ``frame`` as speech (True) or not.

        Tries webrtcvad first; on ValueError (e.g. unsupported frame size)
        falls back to the energy heuristic in ``_is_silent``.
        """
        try:
            return self.vad.is_speech(frame, self.input_sample_rate)
        except ValueError:
            # fall back to energy-based detection
            return not self._is_silent(frame)

    async def _listener(self):
        """Capture microphone audio and push complete utterances to STT.

        Accumulates frames while speech is detected; once silence persists
        for ``silence_duration`` the buffered frames are transcribed.
        Detecting new speech also cancels any in-flight TTS (barge-in).
        """
        frame_size = int(self.input_sample_rate * (self.frame_duration_ms / 1000.0))
        stream = sd.InputStream(
            samplerate=self.input_sample_rate,
            blocksize=frame_size,
            channels=1,
            dtype="int16",
            callback=self._sd_callback,
        )
        stream.start()

        logger.info("Listening for voice input...")

        frames = []
        silent_frames = 0
        # Number of consecutive silent frames that ends an utterance.
        frames_until_silence = int(
            self.silence_duration * 1000 / self.frame_duration_ms
        )
        speaking_detected = False

        try:
            while True:
                frame = await self.input_audio_queue.get()
                is_speech = self._voice_activity_detection(frame)

                if is_speech:
                    speaking_detected = True
                    silent_frames = 0
                    frames.append(frame)

                    # Cancel the current TTS task
                    if hasattr(self, "current_tts_task") and self.current_tts_task:
                        # Signal the generator loop to stop
                        self.current_tts_cancel.set()

                    # Clear the output audio queue
                    # NOTE(review): self.player is created by
                    # _audio_output_processor; if speech arrives before that
                    # task has run, this raises AttributeError — confirm the
                    # startup ordering makes that impossible.
                    self.loop.call_soon_threadsafe(self.player.flush)
                elif speaking_detected:
                    silent_frames += 1
                    frames.append(frame)

                    if silent_frames > frames_until_silence:
                        # Process the voice input
                        if frames:

                            logger.info("Processing voice input...")
                            await self._process_audio(frames)

                        frames = []
                        speaking_detected = False
                        silent_frames = 0
        except (asyncio.CancelledError, KeyboardInterrupt):
            stream.stop()
            stream.close()
            raise
        finally:
            stream.stop()
            stream.close()

    def _sd_callback(self, indata, frames, _time, status):
        """sounddevice capture callback (runs on the PortAudio thread).

        Hands raw int16 bytes to the event loop; frames are silently
        dropped when the input queue is full.
        """
        data = indata.reshape(-1).tobytes()

        def _enqueue():
            try:
                self.input_audio_queue.put_nowait(data)
            except asyncio.QueueFull:
                return

        self.loop.call_soon_threadsafe(_enqueue)

    async def _process_audio(self, frames):
        """Transcribe buffered PCM frames and enqueue any non-empty text."""
        # int16 bytes -> float32 samples in [-1, 1).
        audio = (
            np.frombuffer(b"".join(frames), dtype=np.int16).astype(np.float32) / 32768.0
        )

        async with self.mlx_lock:
            result = await asyncio.to_thread(self.stt.generate, mx.array(audio))
        text = result.text.strip()

        if text:
            logger.info(f"Transcribed: {text}")
            await self.transcription_queue.put(text)

    # response generation

    async def _response_processor(self):
        """Drain the transcription queue, generating a reply per utterance."""
        while True:
            text = await self.transcription_queue.get()
            await self._generate_response(text)
            self.transcription_queue.task_done()

    async def _generate_response(self, text):
        """Run the LLM on the user's utterance and kick off TTS playback.

        The TTS task is started but not awaited, so the processor can
        handle the next utterance while speech is being synthesized.
        """
        def _get_llm_response(llm, tokenizer, messages, *, verbose=False):
            # Runs on a worker thread: apply the chat template, then generate.
            prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            return generate_text(llm, tokenizer, prompt, verbose=verbose).strip()

        try:
            logger.info("Generating response...")

            messages = [
                {
                    "role": "system",
                    "content": "You are a helpful voice assistant. You always respond with short sentences and never use punctuation like parentheses or colons that wouldn't appear in conversational speech.",
                },
                {"role": "user", "content": text},
            ]
            async with self.mlx_lock:
                response_text = await asyncio.to_thread(
                    _get_llm_response, self.llm, self.tokenizer, messages, verbose=False
                )

            logger.info(f"Generated response: {response_text}")

            if response_text:
                # Fresh cancel event per utterance; _listener sets it on barge-in.
                self.current_tts_cancel = asyncio.Event()
                self.current_tts_task = asyncio.create_task(
                    self._speak_response(response_text, self.current_tts_cancel)
                )
        except Exception as e:
            logger.error(f"Generation error: {e}")

    # speech generation

    async def _speak_response(self, text: str, cancel_event: asyncio.Event):
        """
        Speak `text`, yielding PCM chunks into `self.output_audio_queue`.
        Playback can be interrupted at any moment by setting `cancel_event`.
        """
        loop = self.loop

        def _tts_stream(tts, txt, rate, queue, cancel_ev: asyncio.Event):
            # This runs in a worker thread, so we *must* poll a thread-safe flag.
            for chunk in tts.generate(
                txt,
                sample_rate=rate,
                stream=True,
                streaming_interval=self.streaming_interval,
                verbose=False,
            ):
                if cancel_ev.is_set():  # stop immediately on barge-in
                    break
                loop.call_soon_threadsafe(queue.put_nowait, chunk.audio)

        try:
            async with self.mlx_lock:
                await asyncio.to_thread(
                    _tts_stream,
                    self.tts,
                    text,
                    self.output_sample_rate,
                    self.output_audio_queue,
                    cancel_event,
                )
        except asyncio.CancelledError:
            # The coroutine itself was cancelled from outside -> just exit cleanly.
            pass
        except Exception as exc:
            logger.error("Speech synthesis error: %s", exc)

    async def _audio_output_processor(self):
        """Create the audio player and feed it chunks from the output queue."""
        self.player = AudioPlayer(sample_rate=self.output_sample_rate)

        try:
            while True:
                audio = await self.output_audio_queue.get()
                self.player.queue_audio(audio)
                self.output_audio_queue.task_done()
        except (asyncio.CancelledError, KeyboardInterrupt):
            self.player.stop()
            raise
280
+
281
+
282
async def main():
    """Parse command-line options and run a VoicePipeline until it exits."""
    arg_parser = argparse.ArgumentParser(description="Voice Pipeline")
    arg_parser.add_argument(
        "--stt_model",
        type=str,
        default="mlx-community/whisper-large-v3-turbo",
        help="STT model",
    )
    arg_parser.add_argument(
        "--tts_model", type=str, default="mlx-community/csm-1b-fp16", help="TTS model"
    )
    arg_parser.add_argument(
        "--llm_model",
        type=str,
        default="mlx-community/Qwen2.5-0.5B-Instruct-4bit",
        help="LLM model",
    )
    arg_parser.add_argument("--vad_mode", type=int, default=3, help="VAD mode")
    arg_parser.add_argument(
        "--silence_duration", type=float, default=1.5, help="Silence duration"
    )
    arg_parser.add_argument(
        "--silence_threshold", type=float, default=0.03, help="Silence threshold"
    )
    arg_parser.add_argument(
        "--streaming_interval", type=int, default=3, help="Streaming interval"
    )
    opts = arg_parser.parse_args()

    # Hand every CLI option through to the pipeline and block until it stops.
    pipeline = VoicePipeline(
        stt_model=opts.stt_model,
        tts_model=opts.tts_model,
        llm_model=opts.llm_model,
        vad_mode=opts.vad_mode,
        silence_duration=opts.silence_duration,
        silence_threshold=opts.silence_threshold,
        streaming_interval=opts.streaming_interval,
    )
    await pipeline.start()
321
+
322
+
323
+ if __name__ == "__main__":
324
+ try:
325
+ asyncio.run(main())
326
+ except KeyboardInterrupt:
327
+ pass
File without changes
@@ -0,0 +1,174 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import time
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import mlx.core as mx
9
+
10
+ from mlx_audio.stt.utils import load_model
11
+
12
+
13
def parse_args():
    """Build and evaluate the CLI for transcription runs.

    Returns an argparse namespace with ``model``, ``audio``, ``output``,
    ``format`` (default "txt") and ``verbose`` (default False).
    """
    cli = argparse.ArgumentParser(
        description="Generate transcriptions from audio files"
    )
    cli.add_argument("--model", type=str, required=True, help="Path to the model")
    cli.add_argument(
        "--audio", type=str, required=True, help="Path to the audio file"
    )
    cli.add_argument(
        "--output", type=str, required=True, help="Path to save the output"
    )
    cli.add_argument(
        "--format",
        type=str,
        default="txt",
        choices=["txt", "srt", "vtt", "json"],
        help="Output format (txt, srt, vtt, or json)",
    )
    cli.add_argument("--verbose", action="store_true", help="Verbose output")
    return cli.parse_args()
33
+
34
+
35
def format_timestamp(seconds: float) -> str:
    """Convert seconds to HH:MM:SS,mmm format for SRT/VTT"""
    whole = int(seconds)
    hh, rem = divmod(whole, 3600)
    mm = rem // 60
    # Keep the fractional part on the seconds field only.
    ss = seconds % 60
    return f"{hh:02d}:{mm:02d}:{ss:06.3f}".replace(".", ",")
41
+
42
+
43
def format_vtt_timestamp(seconds: float) -> str:
    """Convert seconds to HH:MM:SS.mmm format for VTT"""
    srt_stamp = format_timestamp(seconds)
    # VTT uses a dot as the millisecond separator instead of SRT's comma.
    return srt_stamp.replace(",", ".")
46
+
47
+
48
def save_as_txt(segments, output_path: str):
    """Write the full transcription text to ``<output_path>.txt``."""
    target = f"{output_path}.txt"
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(segments.text)
51
+
52
+
53
def save_as_srt(segments, output_path: str):
    """Write numbered SRT cues, one per sentence, to ``<output_path>.srt``."""
    with open(f"{output_path}.srt", "w", encoding="utf-8") as handle:
        index = 1
        for sentence in segments.sentences:
            begin = format_timestamp(sentence.start)
            finish = format_timestamp(sentence.end)
            handle.write(f"{index}\n")
            handle.write(f"{begin} --> {finish}\n")
            handle.write(f"{sentence.text}\n\n")
            index += 1
61
+
62
+
63
def save_as_vtt(segments, output_path: str):
    """Write WEBVTT cues to ``<output_path>.vtt``.

    Uses the sentence objects when the result exposes ``.sentences``;
    otherwise falls back to the ``.segments`` list of dicts.
    """
    with open(f"{output_path}.vtt", "w", encoding="utf-8") as handle:
        handle.write("WEBVTT\n\n")
        if hasattr(segments, "sentences"):
            for idx, sentence in enumerate(segments.sentences, 1):
                begin = format_vtt_timestamp(sentence.start)
                finish = format_vtt_timestamp(sentence.end)
                handle.write(f"{idx}\n")
                handle.write(f"{begin} --> {finish}\n")
                handle.write(f"{sentence.text}\n\n")
        else:
            for idx, token in enumerate(segments.segments, 1):
                begin = format_vtt_timestamp(token["start"])
                finish = format_vtt_timestamp(token["end"])
                handle.write(f"{idx}\n")
                handle.write(f"{begin} --> {finish}\n")
                handle.write(f"{token['text']}\n\n")
83
+
84
+
85
def _token_entry(token):
    # One JSON record per token timestamp.
    return {
        "text": token.text,
        "start": token.start,
        "end": token.end,
        "duration": token.duration,
    }


def _sentence_entry(sentence):
    # One JSON record per sentence, including its token timeline.
    return {
        "text": sentence.text,
        "start": sentence.start,
        "end": sentence.end,
        "duration": sentence.duration,
        "tokens": [_token_entry(t) for t in sentence.tokens],
    }


def save_as_json(segments, output_path: str):
    """Serialize the transcription result to ``<output_path>.json``.

    Emits a sentence/token hierarchy when the result exposes ``.sentences``,
    otherwise a flat list built from the ``.segments`` dicts.
    """
    if hasattr(segments, "sentences"):
        payload = {
            "text": segments.text,
            "sentences": [_sentence_entry(s) for s in segments.sentences],
        }
    else:
        payload = {
            "text": segments.text,
            "segments": [
                {
                    "text": seg["text"],
                    "start": seg["start"],
                    "end": seg["end"],
                    "duration": seg["end"] - seg["start"],
                }
                for seg in segments.segments
            ],
        }

    with open(f"{output_path}.json", "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
124
+
125
+
126
def generate(
    model_path: str,
    audio_path: str,
    output_path: str,
    format: str = "txt",
    verbose: bool = True,
):
    """Transcribe ``audio_path`` with the model at ``model_path``, save the
    result in the requested ``format``, and return the raw result object.

    ``format`` shadows the builtin by the same name; kept for CLI
    compatibility with ``parse_args``.
    """
    model = load_model(model_path)
    print(f"\n\033[94mModel:\033[0m {model_path}")
    print(f"\033[94mAudio path:\033[0m {audio_path}")
    print(f"\033[94mOutput path:\033[0m {output_path}")
    print(f"\033[94mFormat:\033[0m {format}")
    mx.reset_peak_memory()
    started = time.time()
    segments = model.generate(audio_path)
    finished = time.time()

    if verbose:
        print("\n\033[94mTranscription:\033[0m")
        print(segments.text)
        print("\n\033[94mSegments:\033[0m")
        if hasattr(segments, "segments"):
            print(segments.segments)
        elif hasattr(segments, "tokens"):
            print(segments.tokens)
        else:
            print(segments)

        print(f"\033[94mProcessing time:\033[0m {finished - started:.2f} seconds")
        print(f"\033[94mPeak memory:\033[0m {mx.get_peak_memory() / 1e9:.2f} GB")

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

    # Dispatch to the writer matching the requested format; unknown formats
    # are unreachable via the CLI (argparse restricts the choices).
    writers = {
        "txt": save_as_txt,
        "srt": save_as_srt,
        "vtt": save_as_vtt,
        "json": save_as_json,
    }
    writer = writers.get(format)
    if writer is not None:
        writer(segments, output_path)

    return segments
170
+
171
+
172
+ if __name__ == "__main__":
173
+ args = parse_args()
174
+ generate(args.model, args.audio, args.output, args.format, args.verbose)
File without changes
@@ -0,0 +1 @@
1
+ from .parakeet import Model