nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (580) hide show
  1. nexaai/__init__.py +99 -0
  2. nexaai/_stub.cpython-310-darwin.so +0 -0
  3. nexaai/_version.py +4 -0
  4. nexaai/asr.py +68 -0
  5. nexaai/asr_impl/__init__.py +0 -0
  6. nexaai/asr_impl/mlx_asr_impl.py +93 -0
  7. nexaai/asr_impl/pybind_asr_impl.py +127 -0
  8. nexaai/base.py +39 -0
  9. nexaai/binds/__init__.py +7 -0
  10. nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  12. nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
  13. nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
  14. nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
  15. nexaai/binds/cpu_gpu/libggml.dylib +0 -0
  16. nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
  17. nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
  18. nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
  19. nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
  20. nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
  21. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  22. nexaai/binds/libnexa_bridge.dylib +0 -0
  23. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  24. nexaai/binds/metal/libnexa_plugin.dylib +0 -0
  25. nexaai/binds/metal/py-lib/ml.py +888 -0
  26. nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
  27. nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
  28. nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  29. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  30. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  31. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  32. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  33. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  34. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  35. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  36. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  37. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  38. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  39. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  40. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  41. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  42. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  43. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  44. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  45. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  46. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  47. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  48. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  49. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  50. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  51. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  52. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  53. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  54. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  55. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  56. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  57. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  58. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  59. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  60. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  61. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  62. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  63. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  64. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  65. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  66. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  67. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  68. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  69. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  70. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  71. nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
  72. nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
  73. nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  74. nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  75. nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
  76. nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
  77. nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  78. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  79. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  80. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  81. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  82. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  83. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  84. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  85. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  86. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  87. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  88. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  89. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  90. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  91. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  92. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  93. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  94. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  95. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  96. nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  97. nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
  98. nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
  99. nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
  100. nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
  101. nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
  102. nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  103. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  104. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  105. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  106. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  107. nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
  108. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  109. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  110. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  111. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  112. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  113. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  114. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  115. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  116. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  117. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  118. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  119. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  120. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  121. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  122. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  123. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  124. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  125. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  126. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  127. nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  128. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  129. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  130. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  131. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  132. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  133. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  134. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  135. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  136. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  137. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  138. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  139. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  140. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  141. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  142. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  143. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  144. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  145. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  146. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  147. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  148. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  149. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  150. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  151. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  152. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  153. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  154. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  155. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  156. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  157. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  158. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  159. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  160. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  161. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  162. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  163. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  164. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  165. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  166. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  167. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  168. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  169. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  170. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  171. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  172. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  173. nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
  174. nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
  175. nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
  176. nexaai/binds/metal/py-lib/profiling.py +239 -0
  177. nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
  178. nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
  179. nexaai/binds/nexaml/libggml-base.dylib +0 -0
  180. nexaai/binds/nexaml/libggml-cpu.so +0 -0
  181. nexaai/binds/nexaml/libggml-metal.so +0 -0
  182. nexaai/binds/nexaml/libggml.dylib +0 -0
  183. nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
  184. nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
  185. nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
  186. nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
  187. nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
  188. nexaai/binds/nexaml/libnexaproc.dylib +0 -0
  189. nexaai/binds/nexaml/libomp.dylib +0 -0
  190. nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
  191. nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
  192. nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
  193. nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
  194. nexaai/common.py +106 -0
  195. nexaai/cv.py +95 -0
  196. nexaai/cv_impl/__init__.py +0 -0
  197. nexaai/cv_impl/mlx_cv_impl.py +91 -0
  198. nexaai/cv_impl/pybind_cv_impl.py +124 -0
  199. nexaai/diarize.py +80 -0
  200. nexaai/diarize_impl/__init__.py +1 -0
  201. nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
  202. nexaai/embedder.py +73 -0
  203. nexaai/embedder_impl/__init__.py +0 -0
  204. nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
  205. nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
  206. nexaai/image_gen.py +141 -0
  207. nexaai/image_gen_impl/__init__.py +0 -0
  208. nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
  209. nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
  210. nexaai/llm.py +98 -0
  211. nexaai/llm_impl/__init__.py +0 -0
  212. nexaai/llm_impl/mlx_llm_impl.py +271 -0
  213. nexaai/llm_impl/pybind_llm_impl.py +238 -0
  214. nexaai/log.py +92 -0
  215. nexaai/mlx_backend/asr/__init__.py +12 -0
  216. nexaai/mlx_backend/asr/interface.py +122 -0
  217. nexaai/mlx_backend/common/__init__.py +0 -0
  218. nexaai/mlx_backend/common/utils.py +25 -0
  219. nexaai/mlx_backend/cv/__init__.py +0 -0
  220. nexaai/mlx_backend/cv/generate.py +195 -0
  221. nexaai/mlx_backend/cv/interface.py +162 -0
  222. nexaai/mlx_backend/cv/main.py +81 -0
  223. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  224. nexaai/mlx_backend/embedding/__init__.py +0 -0
  225. nexaai/mlx_backend/embedding/generate.py +333 -0
  226. nexaai/mlx_backend/embedding/interface.py +617 -0
  227. nexaai/mlx_backend/embedding/main.py +173 -0
  228. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  229. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  230. nexaai/mlx_backend/image_gen/__init__.py +1 -0
  231. nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
  232. nexaai/mlx_backend/image_gen/interface.py +82 -0
  233. nexaai/mlx_backend/image_gen/main.py +281 -0
  234. nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
  235. nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
  236. nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
  237. nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
  238. nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
  239. nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
  240. nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
  241. nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
  242. nexaai/mlx_backend/llm/__init__.py +0 -0
  243. nexaai/mlx_backend/llm/generate.py +149 -0
  244. nexaai/mlx_backend/llm/interface.py +764 -0
  245. nexaai/mlx_backend/llm/main.py +68 -0
  246. nexaai/mlx_backend/ml.py +888 -0
  247. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  249. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  250. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  251. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  252. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  253. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  254. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  255. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  256. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  257. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  258. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  259. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  260. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  261. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  262. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  264. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  265. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  266. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  267. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  268. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  269. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  270. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  271. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  272. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  273. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  274. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  275. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  276. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  277. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  278. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  279. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  280. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  281. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  282. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  283. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  284. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  285. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  286. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  287. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  288. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  289. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  290. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  291. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  292. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  293. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  294. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  295. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  296. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  297. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  298. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  299. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  300. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  301. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  302. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  303. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  304. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  305. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  306. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  307. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  308. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  309. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  310. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  311. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  312. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  313. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  314. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  315. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  316. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  317. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  318. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  319. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  320. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  321. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  322. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  353. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  354. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  355. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  356. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  357. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  358. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  359. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  360. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  361. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  362. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  363. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  364. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  365. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  366. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  367. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  368. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  369. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  370. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  371. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  372. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  373. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  374. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  375. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  376. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  377. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  378. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  379. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  380. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  381. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  382. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  383. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  384. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  385. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  386. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  387. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  388. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  389. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  390. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  391. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  392. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  393. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  394. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  395. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  396. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  397. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  398. nexaai/mlx_backend/profiling.py +239 -0
  399. nexaai/mlx_backend/rerank/__init__.py +0 -0
  400. nexaai/mlx_backend/rerank/generate.py +174 -0
  401. nexaai/mlx_backend/rerank/interface.py +287 -0
  402. nexaai/mlx_backend/rerank/main.py +127 -0
  403. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  404. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  405. nexaai/mlx_backend/sd/__init__.py +1 -0
  406. nexaai/mlx_backend/sd/interface.py +362 -0
  407. nexaai/mlx_backend/sd/main.py +286 -0
  408. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  409. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  410. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  411. nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
  412. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  413. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  414. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  415. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  416. nexaai/mlx_backend/tts/__init__.py +12 -0
  417. nexaai/mlx_backend/tts/interface.py +276 -0
  418. nexaai/mlx_backend/vlm/__init__.py +3 -0
  419. nexaai/mlx_backend/vlm/generate.py +572 -0
  420. nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
  421. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
  422. nexaai/mlx_backend/vlm/interface.py +559 -0
  423. nexaai/mlx_backend/vlm/main.py +365 -0
  424. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  425. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  426. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  427. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  428. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  429. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  430. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  431. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  432. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  433. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  434. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  435. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  436. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  437. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  438. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  439. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  440. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  441. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  442. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  443. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  444. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  445. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  446. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  447. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  448. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  449. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  450. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  451. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  452. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  453. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  454. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  455. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  456. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  457. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  458. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  459. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  460. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  461. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  462. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  463. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  464. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  465. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  466. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  467. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  468. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  470. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  471. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  472. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  473. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  474. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  475. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  476. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  477. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  478. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  479. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  480. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  481. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  482. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  483. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  484. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  485. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  486. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  487. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  488. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  489. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  490. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  491. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  492. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  493. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  494. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  495. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  496. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  497. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  498. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  499. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  500. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  501. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  502. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  503. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  504. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  505. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  506. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  507. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  508. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  509. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  510. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  511. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  512. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  513. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  514. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  515. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  516. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  517. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  518. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  519. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  520. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  521. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  522. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  523. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  524. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  525. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  526. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
  527. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
  528. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
  529. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
  530. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
  531. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
  532. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
  533. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
  534. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
  535. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  536. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  537. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  538. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  539. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  540. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  541. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  542. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  543. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
  544. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  545. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  546. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  547. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  548. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  549. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  550. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  551. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  552. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  553. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  554. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  555. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  556. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  557. nexaai/rerank.py +57 -0
  558. nexaai/rerank_impl/__init__.py +0 -0
  559. nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
  560. nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
  561. nexaai/runtime.py +68 -0
  562. nexaai/runtime_error.py +24 -0
  563. nexaai/tts.py +75 -0
  564. nexaai/tts_impl/__init__.py +0 -0
  565. nexaai/tts_impl/mlx_tts_impl.py +94 -0
  566. nexaai/tts_impl/pybind_tts_impl.py +43 -0
  567. nexaai/utils/decode.py +18 -0
  568. nexaai/utils/manifest_utils.py +531 -0
  569. nexaai/utils/model_manager.py +1745 -0
  570. nexaai/utils/model_types.py +49 -0
  571. nexaai/utils/progress_tracker.py +389 -0
  572. nexaai/utils/quantization_utils.py +245 -0
  573. nexaai/vlm.py +130 -0
  574. nexaai/vlm_impl/__init__.py +0 -0
  575. nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
  576. nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
  577. nexaai-1.0.29.dist-info/METADATA +35 -0
  578. nexaai-1.0.29.dist-info/RECORD +580 -0
  579. nexaai-1.0.29.dist-info/WHEEL +5 -0
  580. nexaai-1.0.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,162 @@
1
+ import math
2
+
3
+ import mlx.core as mx
4
+ import numpy as np
5
+ import pyloudnorm as pyln
6
+ import scipy.signal
7
+ import soundfile as sf
8
+
9
+ from mlx_audio.codec import DAC
10
+
11
+
12
def process_audio_array(
    audio: mx.array,
    sample_rate: int = 24000,
    target_loudness: float = -18.0,
    peak_limit: float = -1,
    block_size: float = 0.400,
) -> mx.array:
    """Loudness-normalize and peak-limit an audio signal.

    The input is converted to NumPy, collapsed to a single signal, padded
    with zeros when it is shorter than one loudness-meter block, normalized
    with ``pyloudnorm`` and finally returned as an MLX array reshaped to
    ``(1, 1, -1)``.

    Args:
        audio: Audio samples. When the array has more than one dimension and
            its second axis is larger than 1, that axis is averaged away;
            otherwise singleton axes are squeezed out.
        sample_rate: Sample rate handed to the loudness meter.
        target_loudness: Loudness passed to ``pyln.normalize.loudness`` as
            the normalization target.
        peak_limit: Peak ceiling in dB; it is converted to a linear
            amplitude with ``10 ** (peak_limit / 20)``.
        block_size: Loudness-meter block size in seconds; also determines
            the minimum number of samples the meter is given.

    Returns:
        The processed signal as an ``mx.array`` of shape ``(1, 1, n)``.
    """
    audio_np = np.array(audio)

    # handle multi-channel audio
    if audio_np.ndim > 1:
        if audio_np.shape[1] > 1:
            audio_np = np.mean(audio_np, axis=1)
        else:
            audio_np = np.squeeze(audio_np)

    original_length = len(audio_np)
    min_samples = int(block_size * sample_rate)
    needs_padding = original_length < min_samples

    # The meter needs at least one full block, so short clips are
    # zero-padded for the measurement and trimmed back afterwards.
    if needs_padding:
        audio_padded = np.pad(
            audio_np, (0, min_samples - original_length), mode="constant"
        )
    else:
        audio_padded = audio_np

    # measure and normalize loudness
    meter = pyln.Meter(sample_rate, block_size=block_size)
    measured_loudness = meter.integrated_loudness(audio_padded)
    if np.isfinite(measured_loudness):
        normalized = pyln.normalize.loudness(
            audio_padded, measured_loudness, target_loudness
        )
    else:
        # Silent (or fully gated) audio measures as -inf; applying the
        # resulting infinite gain would fill the signal with NaNs, so the
        # samples are passed through untouched instead.
        normalized = audio_padded

    # apply peak limiting if necessary
    peak_value = np.max(np.abs(normalized))
    threshold_value = 10 ** (peak_limit / 20)
    if peak_value > threshold_value:
        normalized = pyln.normalize.peak(normalized, peak_limit)

    if needs_padding:
        normalized = normalized[:original_length]

    return mx.array(normalized).reshape(1, 1, -1)
55
+
56
+
57
class DacInterface:
    """Thin wrapper around a pretrained DAC codec for encoding/decoding audio."""

    def __init__(self, repo_id: str = "mlx-community/dac-speech-24khz-1.5kbps"):
        """Load the DAC model from ``repo_id`` and fix the working sample rate."""
        self.model = DAC.from_pretrained(repo_id)
        self.sr = 24000

    @staticmethod
    def _range_fn(verbose: bool):
        """Return ``tqdm.trange`` when ``verbose`` is set, else builtin ``range``."""
        if verbose:
            # Imported lazily so tqdm is only needed for verbose runs.
            from tqdm import trange

            return trange
        return range

    def convert_audio(
        self, audio: mx.array, sr: int, target_sr: int, target_channels: int
    ):
        """Convert ``audio`` to ``target_channels`` channels at ``target_sr``.

        The last two axes of the input are read as ``(channels, samples)``;
        a 1-D input is first reshaped to ``(1, samples)``.

        Args:
            audio: Input samples.
            sr: Sample rate of ``audio``.
            target_sr: Sample rate to resample to (skipped when equal to ``sr``).
            target_channels: ``1`` down-mixes by averaging channels; ``2``
                duplicates a mono channel or keeps the first two channels.
                Other values leave the channel layout unchanged.

        Returns:
            The converted audio as an ``mx.array``.
        """
        audio_np = np.array(audio)

        if audio_np.ndim < 2:
            audio_np = audio_np.reshape(1, -1)

        channels, length = audio_np.shape[-2:]

        if target_channels == 1:
            if channels > 1:
                audio_np = np.mean(audio_np, axis=-2, keepdims=True)
        elif target_channels == 2:
            if channels == 1:
                audio_np = np.repeat(audio_np, 2, axis=-2)
            elif channels > 2:
                audio_np = audio_np[..., :2, :]

        if sr != target_sr:
            new_length = int(length * target_sr / sr)
            # Resample along the sample axis so every channel (and any
            # leading batch axes) is handled, instead of indexing axis 0
            # as if it were always the channel axis.
            audio_np = scipy.signal.resample(audio_np, new_length, axis=-1)

        return mx.array(audio_np)

    def convert_audio_array(self, audio: mx.array, sr):
        """Convert ``audio`` to mono at the interface sample rate (``self.sr``)."""
        return self.convert_audio(audio, sr, self.sr, 1)

    def load_audio(self, path):
        """Read an audio file and return mono audio at ``self.sr``.

        Returns:
            An ``mx.array`` reshaped to ``(1, 1, -1)``.
        """
        # always_2d makes soundfile return (frames, channels) even for mono
        # files, so the layout never has to be guessed from the shape.
        audio_np, sr = sf.read(path, always_2d=True)
        # transpose to the channels-first layout convert_audio expects
        audio = mx.array(audio_np).T
        return self.convert_audio_array(audio, sr).reshape(1, 1, -1)

    def preprocess(self, audio_data):
        """Right-pad the last axis with zeros to a multiple of the model hop length.

        ``audio_data`` must be 3-D: the pad spec covers exactly three axes.
        """
        length = audio_data.shape[-1]
        hop_length = self.model.hop_length
        right_pad = math.ceil(length / hop_length) * hop_length - length
        audio_data = mx.pad(audio_data, [(0, 0), (0, 0), (0, right_pad)])
        return audio_data

    def encode(self, x: mx.array, win_duration: float = 5.0, verbose: bool = False):
        """Encode audio into DAC codes, processing it window by window.

        The audio is first run through ``process_audio_array`` and then split
        into non-overlapping windows of ``win_duration`` seconds (rounded up
        to a multiple of the model hop length).

        Args:
            x: Audio samples.
            win_duration: Window length in seconds.
            verbose: Show a tqdm progress bar over the windows.

        Returns:
            The per-window codes concatenated along the last axis.
        """
        x = process_audio_array(x)
        nb, nac, nt = x.shape
        x = x.reshape(nb * nac, 1, nt)
        n_samples = int(win_duration * self.sr)
        n_samples = int(
            math.ceil(n_samples / self.model.hop_length) * self.model.hop_length
        )
        hop = n_samples
        codes_list = []

        range_fn = self._range_fn(verbose)

        for i in range_fn(0, nt, hop):
            chunk = x[..., i : i + n_samples]
            audio_data = self.preprocess(chunk)
            # only the second element of the model's encode result is kept
            _, c, _, _, _ = self.model.encode(audio_data, None)
            codes_list.append(c)

        codes = mx.concatenate(codes_list, axis=-1)
        return codes

    def decode(self, codes: mx.array, verbose: bool = False) -> mx.array:
        """Decode DAC codes back to audio in chunks of 4096 code frames.

        Args:
            codes: Codes to decode; chunking slices the last axis.
            verbose: Show a tqdm progress bar over the chunks.

        Returns:
            The reconstructed audio after ``process_audio_array``.
        """
        model = self.model
        chunk_length = 4096
        recons = []

        range_fn = self._range_fn(verbose)

        @mx.compile
        def decode_chunk(codes):
            z = model.quantizer.from_codes(codes)[0]
            r = model.decode(z)
            return r

        for i in range_fn(0, codes.shape[-1], chunk_length):
            c = codes[..., i : i + chunk_length]
            recons.append(decode_chunk(c))

        # NOTE(review): chunks are joined on the last axis and axes 1/2 are
        # swapped only afterwards — confirm that the last axis of
        # model.decode's output is the time axis when more than one chunk
        # is decoded.
        recons = mx.concatenate(recons, axis=-1)
        return process_audio_array(recons.swapaxes(1, 2))
@@ -0,0 +1,461 @@
1
+ {
2
+ "text": "The cat watched from the windowsill, tail flicking with quiet curiosity as the first snowflakes of winter began to fall, dusting the world in fragile white.",
3
+ "words": [
4
+ {
5
+ "word": "The",
6
+ "duration": 0.2,
7
+ "c1": [
8
+ 720, 720, 474, 691, 607, 126, 597, 607, 897, 288, 362, 903, 333, 1009,
9
+ 79
10
+ ],
11
+ "c2": [
12
+ 658, 663, 237, 915, 74, 74, 966, 721, 893, 722, 630, 516, 861, 385, 149
13
+ ],
14
+ "features": {
15
+ "energy": 10,
16
+ "spectral_centroid": 15,
17
+ "pitch": 45
18
+ }
19
+ },
20
+ {
21
+ "word": "cat",
22
+ "duration": 0.33,
23
+ "c1": [
24
+ 700, 597, 639, 838, 622, 336, 975, 326, 67, 375, 853, 761, 35, 363, 31,
25
+ 1000, 982, 192, 647, 564, 329, 1002, 275, 480, 551
26
+ ],
27
+ "c2": [
28
+ 34, 810, 457, 546, 42, 631, 339, 867, 115, 1011, 509, 369, 473, 85, 190,
29
+ 715, 391, 518, 562, 986, 749, 193, 530, 327, 820
30
+ ],
31
+ "features": {
32
+ "energy": 14,
33
+ "spectral_centroid": 21,
34
+ "pitch": 35
35
+ }
36
+ },
37
+ {
38
+ "word": "watched",
39
+ "duration": 0.44,
40
+ "c1": [
41
+ 625, 668, 168, 524, 462, 151, 549, 951, 597, 820, 489, 329, 377, 144,
42
+ 112, 16, 481, 133, 195, 744, 144, 750, 288, 500, 1000, 58, 916, 597, 72,
43
+ 336, 224, 476, 581
44
+ ],
45
+ "c2": [
46
+ 204, 421, 318, 677, 74, 953, 903, 413, 809, 37, 634, 824, 933, 200, 14,
47
+ 1007, 111, 17, 435, 718, 559, 783, 415, 821, 958, 247, 14, 721, 158,
48
+ 235, 276, 875, 683
49
+ ],
50
+ "features": {
51
+ "energy": 19,
52
+ "spectral_centroid": 21,
53
+ "pitch": 26
54
+ }
55
+ },
56
+ {
57
+ "word": "from",
58
+ "duration": 0.2,
59
+ "c1": [
60
+ 528, 668, 738, 985, 126, 924, 1003, 325, 393, 86, 114, 392, 638, 915,
61
+ 549
62
+ ],
63
+ "c2": [
64
+ 929, 872, 332, 296, 983, 406, 867, 568, 374, 328, 419, 348, 177, 379,
65
+ 181
66
+ ],
67
+ "features": {
68
+ "energy": 10,
69
+ "spectral_centroid": 29,
70
+ "pitch": 14
71
+ }
72
+ },
73
+ {
74
+ "word": "the",
75
+ "duration": 0.12,
76
+ "c1": [470, 985, 152, 474, 967, 558, 460, 728, 470],
77
+ "c2": [596, 246, 314, 246, 756, 238, 606, 262, 499],
78
+ "features": {
79
+ "energy": 23,
80
+ "spectral_centroid": 10,
81
+ "pitch": 23
82
+ }
83
+ },
84
+ {
85
+ "word": "windowsill,",
86
+ "duration": 0.75,
87
+ "c1": [
88
+ 217, 126, 549, 700, 198, 891, 95, 683, 158, 680, 16, 769, 402, 776, 295,
89
+ 258, 68, 213, 669, 865, 719, 29, 949, 329, 216, 481, 284, 224, 221, 359,
90
+ 328, 311, 415, 443, 410, 359, 600, 590, 932, 611, 905, 304, 292, 72,
91
+ 388, 333, 66, 943, 489, 648, 630, 648, 402, 972, 392, 558
92
+ ],
93
+ "c2": [
94
+ 911, 19, 1007, 169, 185, 182, 399, 849, 656, 963, 265, 80, 453, 768,
95
+ 919, 1010, 501, 794, 141, 123, 93, 694, 499, 174, 768, 689, 598, 686,
96
+ 10, 381, 282, 556, 126, 672, 872, 650, 990, 556, 913, 635, 174, 819,
97
+ 999, 423, 64, 272, 112, 600, 453, 678, 791, 301, 206, 187, 819, 948
98
+ ],
99
+ "features": {
100
+ "energy": 17,
101
+ "spectral_centroid": 25,
102
+ "pitch": 24
103
+ }
104
+ },
105
+ {
106
+ "word": "tail",
107
+ "duration": 0.6,
108
+ "c1": [
109
+ 669, 94, 917, 202, 607, 720, 625, 597, 126, 607, 885, 700, 474, 480,
110
+ 126, 126, 551, 720, 126, 551, 720, 607, 572, 234, 114, 963, 963, 975,
111
+ 587, 119, 378, 696, 730, 375, 46, 827, 515, 447, 979, 138, 22, 267, 43,
112
+ 495, 16
113
+ ],
114
+ "c2": [
115
+ 1011, 336, 157, 39, 1000, 721, 862, 413, 557, 569, 74, 569, 141, 493,
116
+ 124, 775, 204, 588, 74, 588, 810, 124, 102, 1021, 83, 848, 297, 339,
117
+ 335, 684, 400, 905, 909, 710, 460, 115, 81, 628, 224, 663, 892, 247,
118
+ 392, 234, 132
119
+ ],
120
+ "features": {
121
+ "energy": 15,
122
+ "spectral_centroid": 23,
123
+ "pitch": 34
124
+ }
125
+ },
126
+ {
127
+ "word": "flicking",
128
+ "duration": 0.45,
129
+ "c1": [
130
+ 978, 489, 630, 588, 436, 798, 4, 975, 245, 325, 415, 4, 393, 4, 4, 997,
131
+ 982, 437, 444, 180, 861, 868, 225, 440, 780, 597, 720, 639, 168, 426,
132
+ 114, 621, 854, 869
133
+ ],
134
+ "c2": [
135
+ 571, 321, 376, 232, 301, 678, 904, 630, 990, 772, 690, 870, 719, 694,
136
+ 332, 558, 301, 194, 279, 443, 852, 64, 709, 401, 401, 14, 74, 873, 134,
137
+ 754, 1002, 595, 540, 525
138
+ ],
139
+ "features": {
140
+ "energy": 9,
141
+ "spectral_centroid": 22,
142
+ "pitch": 23
143
+ }
144
+ },
145
+ {
146
+ "word": "with",
147
+ "duration": 0.23,
148
+ "c1": [
149
+ 621, 392, 756, 459, 433, 881, 786, 198, 702, 847, 490, 27, 680, 146, 58,
150
+ 808, 997
151
+ ],
152
+ "c2": [
153
+ 460, 840, 840, 303, 847, 534, 801, 99, 662, 666, 510, 132, 376, 96, 639,
154
+ 240, 668
155
+ ],
156
+ "features": {
157
+ "energy": 11,
158
+ "spectral_centroid": 15,
159
+ "pitch": 20
160
+ }
161
+ },
162
+ {
163
+ "word": "quiet",
164
+ "duration": 0.37,
165
+ "c1": [
166
+ 969, 291, 572, 720, 625, 85, 698, 478, 811, 956, 232, 85, 962, 817, 986,
167
+ 483, 835, 526, 77, 187, 178, 50, 440, 16, 198, 237, 418, 862
168
+ ],
169
+ "c2": [
170
+ 498, 606, 24, 629, 662, 181, 119, 678, 340, 736, 217, 204, 935, 796,
171
+ 118, 478, 818, 791, 329, 209, 5, 234, 337, 647, 110, 922, 933, 1011
172
+ ],
173
+ "features": {
174
+ "energy": 12,
175
+ "spectral_centroid": 12,
176
+ "pitch": 43
177
+ }
178
+ },
179
+ {
180
+ "word": "curiosity",
181
+ "duration": 0.71,
182
+ "c1": [
183
+ 321, 402, 215, 607, 720, 224, 731, 621, 491, 720, 551, 456, 336, 688,
184
+ 476, 953, 718, 806, 410, 786, 976, 664, 855, 433, 756, 396, 699, 776,
185
+ 443, 739, 932, 22, 305, 353, 503, 564, 978, 407, 395, 798, 324, 168,
186
+ 909, 328, 328, 443, 738, 114, 962, 681, 535, 701, 382
187
+ ],
188
+ "c2": [
189
+ 777, 665, 629, 327, 831, 764, 162, 725, 810, 170, 629, 774, 108, 948,
190
+ 972, 449, 600, 905, 81, 765, 601, 422, 820, 746, 450, 346, 733, 77, 733,
191
+ 81, 722, 576, 286, 271, 714, 95, 346, 133, 514, 799, 122, 900, 568, 666,
192
+ 209, 668, 558, 630, 165, 587, 423, 904, 629
193
+ ],
194
+ "features": {
195
+ "energy": 10,
196
+ "spectral_centroid": 29,
197
+ "pitch": 22
198
+ }
199
+ },
200
+ {
201
+ "word": "as",
202
+ "duration": 0.48,
203
+ "c1": [
204
+ 474, 936, 336, 589, 254, 854, 79, 140, 863, 854, 701, 260, 929, 140,
205
+ 669, 808, 411, 232, 434, 542, 597, 126, 551, 126, 607, 1011, 774, 681,
206
+ 94, 25, 971, 288, 305, 347, 355, 415
207
+ ],
208
+ "c2": [
209
+ 267, 813, 232, 361, 77, 607, 252, 933, 508, 658, 846, 849, 873, 496,
210
+ 832, 167, 440, 124, 557, 124, 736, 588, 569, 983, 497, 360, 810, 274,
211
+ 588, 365, 517, 934, 957, 839, 646, 720
212
+ ],
213
+ "features": {
214
+ "energy": 7,
215
+ "spectral_centroid": 31,
216
+ "pitch": 23
217
+ }
218
+ },
219
+ {
220
+ "word": "the",
221
+ "duration": 0.13,
222
+ "c1": [359, 568, 700, 985, 80, 580, 274, 129, 600, 794],
223
+ "c2": [423, 833, 245, 690, 209, 688, 765, 453, 677, 615],
224
+ "features": {
225
+ "energy": 9,
226
+ "spectral_centroid": 26,
227
+ "pitch": 20
228
+ }
229
+ },
230
+ {
231
+ "word": "first",
232
+ "duration": 0.36,
233
+ "c1": [
234
+ 997, 325, 147, 4, 780, 669, 621, 896, 30, 686, 526, 399, 210, 783, 216,
235
+ 144, 329, 448, 481, 288, 132, 600, 168, 221, 415, 415, 528
236
+ ],
237
+ "c2": [
238
+ 325, 666, 627, 629, 240, 665, 650, 481, 962, 328, 128, 358, 166, 264,
239
+ 555, 30, 815, 10, 669, 525, 450, 746, 919, 621, 647, 16, 601
240
+ ],
241
+ "features": {
242
+ "energy": 13,
243
+ "spectral_centroid": 28,
244
+ "pitch": 22
245
+ }
246
+ },
247
+ {
248
+ "word": "snowflakes",
249
+ "duration": 0.76,
250
+ "c1": [
251
+ 1003, 680, 607, 720, 126, 668, 336, 224, 114, 997, 426, 997, 147, 221,
252
+ 359, 328, 1003, 738, 974, 151, 782, 179, 190, 553, 453, 761, 778, 23,
253
+ 128, 643, 125, 7, 345, 223, 275, 524, 325, 764, 114, 953, 70, 75, 449,
254
+ 513, 783, 830, 825, 365, 819, 920, 669, 700, 700, 720, 220, 209, 221
255
+ ],
256
+ "c2": [
257
+ 276, 489, 810, 975, 775, 913, 1022, 818, 340, 481, 690, 366, 924, 782,
258
+ 366, 481, 400, 998, 872, 556, 688, 719, 78, 952, 119, 412, 286, 847, 60,
259
+ 381, 86, 694, 779, 55, 246, 374, 143, 91, 209, 640, 313, 873, 295, 355,
260
+ 333, 705, 468, 1008, 317, 87, 105, 511, 260, 650, 574, 88, 690
261
+ ],
262
+ "features": {
263
+ "energy": 12,
264
+ "spectral_centroid": 29,
265
+ "pitch": 22
266
+ }
267
+ },
268
+ {
269
+ "word": "of",
270
+ "duration": 0.15,
271
+ "c1": [443, 328, 528, 85, 313, 145, 588, 140, 114, 325, 325],
272
+ "c2": [924, 835, 400, 832, 397, 1011, 695, 716, 366, 489, 487],
273
+ "features": {
274
+ "energy": 7,
275
+ "spectral_centroid": 34,
276
+ "pitch": 13
277
+ }
278
+ },
279
+ {
280
+ "word": "winter",
281
+ "duration": 0.29,
282
+ "c1": [
283
+ 559, 71, 549, 64, 902, 609, 206, 386, 428, 529, 92, 1020, 148, 456, 605,
284
+ 673, 958, 897, 250, 716, 236, 232
285
+ ],
286
+ "c2": [
287
+ 891, 358, 1016, 185, 558, 392, 63, 45, 238, 404, 603, 520, 657, 628,
288
+ 748, 649, 629, 298, 772, 483, 1008, 401
289
+ ],
290
+ "features": {
291
+ "energy": 18,
292
+ "spectral_centroid": 16,
293
+ "pitch": 31
294
+ }
295
+ },
296
+ {
297
+ "word": "began",
298
+ "duration": 0.24,
299
+ "c1": [
300
+ 490, 6, 596, 669, 1011, 700, 583, 349, 666, 783, 215, 126, 61, 22, 945,
301
+ 773, 920, 975
302
+ ],
303
+ "c2": [
304
+ 194, 225, 140, 243, 14, 650, 929, 671, 323, 365, 556, 298, 707, 483,
305
+ 550, 57, 127, 886
306
+ ],
307
+ "features": {
308
+ "energy": 11,
309
+ "spectral_centroid": 12,
310
+ "pitch": 18
311
+ }
312
+ },
313
+ {
314
+ "word": "to",
315
+ "duration": 0.2,
316
+ "c1": [
317
+ 265, 1021, 113, 178, 698, 561, 97, 402, 25, 916, 766, 660, 159, 945, 967
318
+ ],
319
+ "c2": [
320
+ 141, 976, 455, 403, 760, 738, 519, 123, 327, 721, 690, 904, 689, 140,
321
+ 615
322
+ ],
323
+ "features": {
324
+ "energy": 13,
325
+ "spectral_centroid": 19,
326
+ "pitch": 20
327
+ }
328
+ },
329
+ {
330
+ "word": "fall,",
331
+ "duration": 0.39,
332
+ "c1": [
333
+ 781, 325, 4, 114, 997, 415, 4, 443, 953, 781, 399, 993, 489, 383, 920,
334
+ 383, 272, 755, 843, 450, 763, 392, 411, 682, 895, 443, 490, 863, 79
335
+ ],
336
+ "c2": [
337
+ 143, 990, 209, 990, 990, 556, 462, 952, 914, 702, 301, 833, 779, 982,
338
+ 26, 458, 519, 9, 264, 74, 304, 110, 646, 905, 185, 959, 53, 543, 909
339
+ ],
340
+ "features": {
341
+ "energy": 13,
342
+ "spectral_centroid": 14,
343
+ "pitch": 18
344
+ }
345
+ },
346
+ {
347
+ "word": "dusting",
348
+ "duration": 0.89,
349
+ "c1": [
350
+ 27, 669, 490, 691, 691, 625, 625, 572, 474, 885, 215, 215, 215, 215,
351
+ 215, 215, 75, 718, 94, 924, 232, 818, 14, 232, 985, 547, 955, 4, 627,
352
+ 524, 524, 579, 462, 104, 597, 720, 720, 491, 597, 571, 802, 864, 315,
353
+ 515, 832, 219, 133, 923, 773, 245, 415, 328, 590, 80, 528, 322, 808,
354
+ 551, 625, 716, 158, 562, 712, 477, 905, 920, 424
355
+ ],
356
+ "c2": [
357
+ 206, 521, 77, 447, 260, 810, 74, 301, 243, 775, 243, 775, 880, 862,
358
+ 1017, 806, 806, 631, 873, 806, 806, 722, 14, 531, 630, 500, 990, 240,
359
+ 690, 431, 240, 815, 449, 273, 903, 569, 325, 629, 872, 239, 686, 189,
360
+ 774, 264, 314, 628, 107, 120, 560, 929, 1008, 610, 24, 929, 400, 949,
361
+ 431, 721, 447, 443, 774, 392, 923, 855, 747, 144, 460
362
+ ],
363
+ "features": {
364
+ "energy": 14,
365
+ "spectral_centroid": 28,
366
+ "pitch": 30
367
+ }
368
+ },
369
+ {
370
+ "word": "the",
371
+ "duration": 0.12,
372
+ "c1": [396, 433, 276, 530, 316, 117, 112, 7, 531],
373
+ "c2": [332, 479, 262, 239, 123, 239, 453, 499, 545],
374
+ "features": {
375
+ "energy": 23,
376
+ "spectral_centroid": 11,
377
+ "pitch": 30
378
+ }
379
+ },
380
+ {
381
+ "word": "world",
382
+ "duration": 0.32,
383
+ "c1": [
384
+ 217, 489, 897, 607, 402, 383, 496, 937, 247, 206, 790, 32, 406, 856,
385
+ 715, 458, 278, 481, 503, 399, 871, 453, 858, 392
386
+ ],
387
+ "c2": [
388
+ 593, 959, 461, 546, 242, 438, 81, 99, 939, 361, 269, 571, 525, 542, 246,
389
+ 10, 613, 228, 913, 252, 132, 132, 287, 559
390
+ ],
391
+ "features": {
392
+ "energy": 22,
393
+ "spectral_centroid": 11,
394
+ "pitch": 31
395
+ }
396
+ },
397
+ {
398
+ "word": "in",
399
+ "duration": 0.23,
400
+ "c1": [
401
+ 558, 497, 436, 598, 607, 416, 311, 906, 955, 905, 448, 54, 92, 487, 770,
402
+ 298, 490
403
+ ],
404
+ "c2": [
405
+ 838, 399, 420, 819, 325, 929, 124, 214, 1021, 728, 975, 688, 132, 718,
406
+ 724, 911, 536
407
+ ],
408
+ "features": {
409
+ "energy": 14,
410
+ "spectral_centroid": 16,
411
+ "pitch": 22
412
+ }
413
+ },
414
+ {
415
+ "word": "fragile",
416
+ "duration": 0.41,
417
+ "c1": [
418
+ 415, 325, 953, 359, 325, 838, 359, 764, 842, 341, 706, 674, 971, 592,
419
+ 507, 16, 628, 481, 626, 691, 1011, 610, 336, 476, 528, 637, 472, 251,
420
+ 945, 811, 406
421
+ ],
422
+ "c2": [
423
+ 126, 990, 374, 143, 629, 868, 338, 91, 346, 393, 407, 987, 987, 1009,
424
+ 617, 854, 824, 439, 789, 311, 810, 497, 664, 549, 135, 908, 702, 639,
425
+ 320, 698, 414
426
+ ],
427
+ "features": {
428
+ "energy": 13,
429
+ "spectral_centroid": 20,
430
+ "pitch": 18
431
+ }
432
+ },
433
+ {
434
+ "word": "white.",
435
+ "duration": 0.75,
436
+ "c1": [
437
+ 26, 432, 1, 651, 998, 716, 998, 727, 978, 311, 85, 895, 279, 392, 669,
438
+ 916, 549, 1011, 97, 597, 296, 392, 526, 998, 835, 468, 871, 405, 26,
439
+ 759, 524, 107, 77, 22, 260, 682, 621, 79, 682, 411, 701, 972, 691, 720,
440
+ 551, 597, 660, 224, 236, 70, 652, 215, 126, 474, 597, 625
441
+ ],
442
+ "c2": [
443
+ 475, 778, 695, 612, 913, 315, 536, 593, 55, 371, 19, 560, 821, 646, 151,
444
+ 801, 821, 413, 14, 922, 629, 380, 417, 679, 487, 562, 821, 706, 324,
445
+ 896, 169, 594, 810, 864, 810, 588, 862, 969, 14, 105, 528, 165, 420,
446
+ 170, 821, 423, 977, 904, 690, 235, 702, 14, 124, 350, 74, 413
447
+ ],
448
+ "features": {
449
+ "energy": 13,
450
+ "spectral_centroid": 11,
451
+ "pitch": 23
452
+ }
453
+ }
454
+ ],
455
+ "global_features": {
456
+ "energy": 13,
457
+ "spectral_centroid": 20,
458
+ "pitch": 28
459
+ },
460
+ "interface_version": 3
461
+ }