nexaai-1.0.4rc10-py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nexaai might be problematic; see the registry's advisory page for details.

Files changed (519)
  1. nexaai/__init__.py +71 -0
  2. nexaai/_version.py +4 -0
  3. nexaai/asr.py +60 -0
  4. nexaai/asr_impl/__init__.py +0 -0
  5. nexaai/asr_impl/mlx_asr_impl.py +91 -0
  6. nexaai/asr_impl/pybind_asr_impl.py +43 -0
  7. nexaai/base.py +39 -0
  8. nexaai/binds/__init__.py +3 -0
  9. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  10. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/libnexa_bridge.dylib +0 -0
  12. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  13. nexaai/binds/nexa_llama_cpp/libggml-base.dylib +0 -0
  14. nexaai/binds/nexa_llama_cpp/libggml-cpu.so +0 -0
  15. nexaai/binds/nexa_llama_cpp/libggml-metal.so +0 -0
  16. nexaai/binds/nexa_llama_cpp/libggml.dylib +0 -0
  17. nexaai/binds/nexa_llama_cpp/libllama.dylib +0 -0
  18. nexaai/binds/nexa_llama_cpp/libmtmd.dylib +0 -0
  19. nexaai/binds/nexa_llama_cpp/libnexa_plugin.dylib +0 -0
  20. nexaai/binds/nexa_mlx/libnexa_plugin.dylib +0 -0
  21. nexaai/binds/nexa_mlx/py-lib/ml.py +842 -0
  22. nexaai/binds/nexa_mlx/py-lib/mlx_audio/__init__.py +0 -0
  23. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/__init__.py +1 -0
  24. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  25. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  26. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  27. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  28. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  29. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  30. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  31. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  32. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  33. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  34. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  35. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  36. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  37. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  38. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  39. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  40. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  41. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  42. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  43. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  44. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  45. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  46. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  47. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  48. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  49. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  50. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  51. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  52. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  53. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  54. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  55. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  56. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  57. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  58. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  59. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  60. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  61. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  62. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  63. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  64. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  65. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  66. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  67. nexaai/binds/nexa_mlx/py-lib/mlx_audio/server.py +525 -0
  68. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/__init__.py +0 -0
  69. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  70. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  71. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/__init__.py +0 -0
  72. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/generate.py +174 -0
  73. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  74. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  75. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  76. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  77. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  78. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  79. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  80. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  81. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  82. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  83. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  84. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  85. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  86. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  87. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  88. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  89. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  90. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  91. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  92. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  93. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/utils.py +195 -0
  94. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/__init__.py +1 -0
  95. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/audio_player.py +120 -0
  96. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/convert.py +71 -0
  97. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/generate.py +449 -0
  98. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  99. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  100. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  101. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  102. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  103. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/base.py +84 -0
  104. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  105. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  106. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  107. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  108. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  109. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  110. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  111. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  112. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  113. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  114. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  115. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  116. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  117. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  118. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  119. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  120. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  121. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  122. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  123. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  124. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  125. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  126. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  127. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  128. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  129. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  130. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  131. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  132. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  133. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  134. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  135. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  136. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  137. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  138. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  139. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  140. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  141. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  142. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  143. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  144. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  145. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  146. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  147. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  148. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  149. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  150. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  151. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  152. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  153. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  154. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  155. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  156. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  157. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  158. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  159. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  160. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  161. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  162. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  163. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  164. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  165. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  166. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  167. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  168. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  169. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/utils.py +337 -0
  170. nexaai/binds/nexa_mlx/py-lib/mlx_audio/utils.py +237 -0
  171. nexaai/binds/nexa_mlx/py-lib/mlx_audio/version.py +1 -0
  172. nexaai/binds/nexa_mlx/py-lib/profiling.py +239 -0
  173. nexaai/common.py +61 -0
  174. nexaai/cv.py +87 -0
  175. nexaai/cv_impl/__init__.py +0 -0
  176. nexaai/cv_impl/mlx_cv_impl.py +88 -0
  177. nexaai/cv_impl/pybind_cv_impl.py +31 -0
  178. nexaai/embedder.py +68 -0
  179. nexaai/embedder_impl/__init__.py +0 -0
  180. nexaai/embedder_impl/mlx_embedder_impl.py +114 -0
  181. nexaai/embedder_impl/pybind_embedder_impl.py +91 -0
  182. nexaai/image_gen.py +136 -0
  183. nexaai/image_gen_impl/__init__.py +0 -0
  184. nexaai/image_gen_impl/mlx_image_gen_impl.py +291 -0
  185. nexaai/image_gen_impl/pybind_image_gen_impl.py +84 -0
  186. nexaai/llm.py +89 -0
  187. nexaai/llm_impl/__init__.py +0 -0
  188. nexaai/llm_impl/mlx_llm_impl.py +249 -0
  189. nexaai/llm_impl/pybind_llm_impl.py +207 -0
  190. nexaai/mlx_backend/asr/__init__.py +12 -0
  191. nexaai/mlx_backend/asr/interface.py +122 -0
  192. nexaai/mlx_backend/common/__init__.py +0 -0
  193. nexaai/mlx_backend/common/utils.py +25 -0
  194. nexaai/mlx_backend/cv/__init__.py +0 -0
  195. nexaai/mlx_backend/cv/generate.py +195 -0
  196. nexaai/mlx_backend/cv/interface.py +151 -0
  197. nexaai/mlx_backend/cv/main.py +81 -0
  198. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  199. nexaai/mlx_backend/embedding/__init__.py +0 -0
  200. nexaai/mlx_backend/embedding/generate.py +130 -0
  201. nexaai/mlx_backend/embedding/interface.py +312 -0
  202. nexaai/mlx_backend/embedding/main.py +82 -0
  203. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  204. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  205. nexaai/mlx_backend/llm/__init__.py +0 -0
  206. nexaai/mlx_backend/llm/generate.py +149 -0
  207. nexaai/mlx_backend/llm/interface.py +764 -0
  208. nexaai/mlx_backend/llm/main.py +68 -0
  209. nexaai/mlx_backend/ml.py +842 -0
  210. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  211. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  212. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  213. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  214. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  215. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  216. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  217. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  218. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  219. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  220. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  221. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  222. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  223. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  224. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  225. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  226. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  227. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  228. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  229. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  230. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  231. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  232. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  233. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  234. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  235. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  236. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  237. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  238. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  239. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  240. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  241. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  242. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  243. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  244. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  245. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  246. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  247. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  249. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  250. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  251. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  252. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  253. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  254. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  255. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  256. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  257. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  258. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  259. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  260. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  261. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  262. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  264. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  265. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  266. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  267. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  268. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  269. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  270. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  271. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  272. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  273. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  274. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  275. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  276. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  277. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  278. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  279. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  280. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  281. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  282. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  283. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  284. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  285. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  286. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  287. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  288. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  289. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  290. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  291. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  292. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  293. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  294. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  295. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  296. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  297. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  298. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  299. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  300. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  301. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  302. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  303. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  304. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  305. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  306. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  307. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  308. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  309. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  310. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  311. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  312. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  313. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  314. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  315. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  316. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  317. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  318. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  319. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  320. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  321. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  322. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  353. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  354. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  355. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  356. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  357. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  358. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  359. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  360. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  361. nexaai/mlx_backend/profiling.py +239 -0
  362. nexaai/mlx_backend/rerank/__init__.py +0 -0
  363. nexaai/mlx_backend/rerank/generate.py +174 -0
  364. nexaai/mlx_backend/rerank/interface.py +287 -0
  365. nexaai/mlx_backend/rerank/main.py +127 -0
  366. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  367. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  368. nexaai/mlx_backend/sd/__init__.py +1 -0
  369. nexaai/mlx_backend/sd/interface.py +362 -0
  370. nexaai/mlx_backend/sd/main.py +286 -0
  371. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  372. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  373. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  374. nexaai/mlx_backend/sd/modeling/model_io.py +330 -0
  375. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  376. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  377. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  378. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  379. nexaai/mlx_backend/tts/__init__.py +12 -0
  380. nexaai/mlx_backend/tts/interface.py +276 -0
  381. nexaai/mlx_backend/vlm/__init__.py +3 -0
  382. nexaai/mlx_backend/vlm/generate.py +572 -0
  383. nexaai/mlx_backend/vlm/interface.py +406 -0
  384. nexaai/mlx_backend/vlm/main.py +157 -0
  385. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  386. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  387. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  388. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  389. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  390. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  391. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  392. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  393. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  394. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  395. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  396. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  397. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  398. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  399. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  400. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  401. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  402. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  403. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  404. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  405. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  406. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  407. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  408. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  409. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  410. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  411. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  412. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  413. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  414. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  415. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  416. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  417. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  418. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  419. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  420. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  421. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  422. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  423. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  424. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  425. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  426. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  427. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  428. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  429. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  430. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  431. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  432. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  433. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  434. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  435. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  436. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  437. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  438. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  439. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  440. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  441. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  442. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  443. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  444. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  445. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  446. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  447. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  448. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  449. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  450. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  451. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  452. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  453. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  454. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  455. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  456. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  457. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  458. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  459. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  460. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  461. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  462. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  463. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  464. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  465. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  466. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  467. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  468. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  470. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  471. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  472. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  473. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  474. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  475. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  476. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  477. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  478. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  479. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  480. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  481. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  482. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  483. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  484. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  485. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  486. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  487. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  488. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  489. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  490. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  491. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  492. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  493. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  494. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  495. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  496. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  497. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  498. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  499. nexaai/rerank.py +51 -0
  500. nexaai/rerank_impl/__init__.py +0 -0
  501. nexaai/rerank_impl/mlx_rerank_impl.py +91 -0
  502. nexaai/rerank_impl/pybind_rerank_impl.py +42 -0
  503. nexaai/runtime.py +64 -0
  504. nexaai/tts.py +70 -0
  505. nexaai/tts_impl/__init__.py +0 -0
  506. nexaai/tts_impl/mlx_tts_impl.py +93 -0
  507. nexaai/tts_impl/pybind_tts_impl.py +42 -0
  508. nexaai/utils/avatar_fetcher.py +104 -0
  509. nexaai/utils/decode.py +18 -0
  510. nexaai/utils/model_manager.py +1195 -0
  511. nexaai/utils/progress_tracker.py +372 -0
  512. nexaai/vlm.py +120 -0
  513. nexaai/vlm_impl/__init__.py +0 -0
  514. nexaai/vlm_impl/mlx_vlm_impl.py +205 -0
  515. nexaai/vlm_impl/pybind_vlm_impl.py +228 -0
  516. nexaai-1.0.4rc10.dist-info/METADATA +26 -0
  517. nexaai-1.0.4rc10.dist-info/RECORD +519 -0
  518. nexaai-1.0.4rc10.dist-info/WHEEL +5 -0
  519. nexaai-1.0.4rc10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,842 @@
1
+ # This file defines the python interface that c-lib expects from a python backend
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+ from typing import (
8
+ Any,
9
+ Callable,
10
+ List,
11
+ Optional,
12
+ Protocol,
13
+ Sequence,
14
+ Tuple,
15
+ TypedDict,
16
+ )
17
+
18
+ # --------------------------------------------------------------------------------------
19
+ # Core aliases & callback protocols
20
+ # --------------------------------------------------------------------------------------
21
+
22
# A filesystem path, exchanged with the C bridge as a plain string.
Path = str

# Callback invoked with each log message emitted by the backend.
LogCallback = Callable[[str], None]
25
+
26
+
27
class TokenCallback(Protocol):
    """Per-token streaming callback.

    Receives each generated token plus the caller-supplied *user_data* and
    returns a bool (presumably whether to continue generation — confirm
    against the C bridge contract).
    """
    def __call__(self, token: str, user_data: Any) -> bool: ...
29
+
30
+
31
+ # --------------------------------------------------------------------------------------
32
+ # Core module functions
33
+ # --------------------------------------------------------------------------------------
34
+
35
def init() -> None:
    """Initialize the ML module (no-op placeholder for the C bridge)."""
38
+
39
+
40
def deinit() -> None:
    """Deinitialize the ML module (no-op placeholder for the C bridge)."""
43
+
44
+
45
def set_log(callback: LogCallback) -> None:
    """Register *callback* as the logging hook (no-op placeholder)."""
48
+
49
+
50
def log(message: str) -> None:
    """Emit *message* through the logging hook (no-op placeholder)."""
53
+
54
+
55
def model_config_default() -> ModelConfig:
    """Build and return a ModelConfig carrying the library defaults."""
    defaults = ModelConfig()
    return defaults
58
+
59
+
60
+ # --------------------------------------------------------------------------------------
61
+ # Basic data structures
62
+ # --------------------------------------------------------------------------------------
63
+
64
@dataclass
class Image:
    """Image data structure."""
    data: List[float]  # flattened pixel buffer: width × height × channels values
    width: int  # image width in pixels
    height: int  # image height in pixels
    channels: int  # 3 = RGB, 4 = RGBA
71
+
72
+
73
@dataclass
class Audio:
    """Audio data structure."""
    data: List[float]  # flattened sample buffer: num_samples × channels values
    sample_rate: int  # sample rate in Hz
    channels: int  # number of audio channels
    num_samples: int  # number of samples (see data layout above)
80
+
81
+
82
@dataclass
class Video:
    """Video data structure."""
    data: List[float]  # flattened frame buffer: width × height × channels × num_frames values
    width: int  # frame width in pixels
    height: int  # frame height in pixels
    channels: int  # color channels per frame
    num_frames: int  # number of frames in the buffer
90
+
91
+
92
+ # --------------------------------------------------------------------------------------
93
+ # Language-model structures
94
+ # --------------------------------------------------------------------------------------
95
+
96
@dataclass
class ModelConfig:
    """Configuration for model parameters.

    Field names mirror the llama.cpp loading options (see the llama_decode
    reference in the n_batch comment).
    """
    n_ctx: int = 0  # text context, 0 = from model
    n_threads: int = 0  # number of threads to use for generation
    n_threads_batch: int = 0  # number of threads to use for batch processing
    n_batch: int = 0  # logical maximum batch size that can be submitted to llama_decode
    n_ubatch: int = 0  # physical maximum batch size
    n_seq_max: int = 0  # max number of sequences (i.e. distinct states for recurrent models)
    chat_template_path: Optional[Path] = None  # path to chat template file, optional
    chat_template_content: Optional[str] = None  # content of chat template file, optional
107
+
108
+
109
@dataclass
class SamplerConfig:
    """Configuration for text sampling."""
    temperature: float = 0.7  # sampling temperature
    top_p: float = 0.9  # nucleus sampling probability cutoff
    top_k: int = 40  # top-k sampling cutoff
    min_p: float = 0.0  # Minimum probability for nucleus sampling
    repetition_penalty: float = 1.0
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    seed: int = -1  # -1 for random
    grammar_path: Optional[Path] = None  # optional path to a grammar file
    grammar_string: Optional[str] = None  # Optional grammar string (BNF-like format)
122
+
123
+
124
@dataclass
class GenerationConfig:
    """Configuration for text generation."""
    max_tokens: int = 512  # upper bound on the number of generated tokens
    stop: Sequence[str] = field(default_factory=tuple)  # stop sequences that end generation
    n_past: int = 0  # tokens already in context (presumably for KV-cache reuse — confirm with backend)
    sampler_config: Optional[SamplerConfig] = None  # optional sampler settings
    image_paths: Optional[Sequence[Path]] = None  # Array of image paths for VLM (None if none)
    audio_paths: Optional[Sequence[Path]] = None  # Array of audio paths for VLM (None if none)
133
+
134
+
135
@dataclass
class ChatMessage:
    """A chat message with role and content."""
    role: str  # "user" | "assistant" | "system"
    content: str  # message text
140
+
141
+
142
class ToolFunction(TypedDict):
    """Function description used for tool calling."""
    name: str  # function name
    description: str  # human-readable description of the function
    parameters_json: str  # parameter schema as a JSON string
146
+
147
+
148
class Tool(TypedDict):
    """Tool definition wrapping a callable function."""
    type: str  # tool type tag (e.g. "function")
    function: ToolFunction  # the function this tool exposes
151
+
152
+
153
+ # --------------------------------------------------------------------------------------
154
+ # Embedding / rerank / diffusion / OCR / ASR / TTS utilities
155
+ # --------------------------------------------------------------------------------------
156
+
157
@dataclass
class EmbeddingConfig:
    """Configuration for embeddings."""
    batch_size: int = 1  # number of texts embedded per batch
    normalize: bool = True  # whether to normalize output vectors
    normalize_method: str = "l2"  # "l2" | "mean" | "none"
163
+
164
+
165
@dataclass
class RerankConfig:
    """Configuration for reranking."""
    batch_size: int = 1  # number of documents scored per batch
    normalize: bool = True  # whether to normalize output scores
    normalize_method: str = "softmax"  # "softmax" | "min-max" | "none"
171
+
172
+
173
@dataclass
class ImageSamplerConfig:
    """Configuration for image sampling."""
    method: str = "ddim"  # sampling method name
    steps: int = 20  # number of inference steps
    guidance_scale: float = 7.5  # guidance scale
    eta: float = 0.0  # eta parameter (used by DDIM-style samplers)
    seed: int = -1  # -1 for random
181
+
182
+
183
@dataclass
class ImageGenerationConfig:
    """Configuration for image generation."""
    prompts: str | List[str]  # one prompt or a batch of prompts
    negative_prompts: str | List[str] | None = None  # optional negative prompt(s)
    height: int = 512  # output height in pixels
    width: int = 512  # output width in pixels
    sampler_config: Optional[ImageSamplerConfig] = None  # optional sampler settings
    lora_id: int = -1  # -1 for none
    init_image: Optional[Image] = None  # starting image for img2img
    strength: float = 1.0  # img2img strength (TODO confirm valid range with backend)
194
+
195
+
196
@dataclass
class SchedulerConfig:
    """Configuration for diffusion scheduler."""
    type: str = "ddim"  # scheduler type name
    num_train_timesteps: int = 1000  # number of training timesteps
    steps_offset: int = 0  # An offset added to the inference steps
    beta_start: float = 0.00085  # noise-schedule start value
    beta_end: float = 0.012  # noise-schedule end value
    beta_schedule: str = "scaled_linear"  # beta schedule curve
    prediction_type: str = "epsilon"  # model prediction target
    timestep_type: str = "discrete"
    timestep_spacing: str = "linspace"
    interpolation_type: str = "linear"
    config_path: Optional[Path] = None  # optional scheduler config file to load
210
+
211
+
212
@dataclass
class ASRConfig:
    """Configuration for ASR."""
    timestamps: str = "none"  # "none" | "segment" | "word"
    beam_size: int = 5  # beam width used during decoding
    stream: bool = False  # whether to stream partial results
218
+
219
+
220
@dataclass
class ASRResult:
    """Result from ASR processing."""
    transcript: str  # recognized text
    confidence_scores: Sequence[float]  # per-unit confidence values (alignment with timestamps — TODO confirm)
    timestamps: Sequence[Tuple[float, float]]  # (start, end) pairs
    duration_us: float  # duration in microseconds (per the `_us` suffix)
227
+
228
+
229
@dataclass
class TTSConfig:
    """Configuration for TTS."""
    voice: str = "default"  # voice identifier
    speed: float = 1.0  # synthesis speed multiplier
    seed: int = -1  # -1 for random
    sample_rate: int = 22050  # output sample rate in Hz
236
+
237
+
238
@dataclass
class TTSSamplerConfig:
    """Configuration for TTS sampling."""
    temperature: float = 1.0  # sampling temperature
    noise_scale: float = 0.667  # noise scale
    length_scale: float = 1.0  # length scale
244
+
245
+
246
@dataclass
class TTSResult:
    """Result from TTS processing."""
    audio_path: str  # Path where the synthesized audio is saved
    duration_seconds: float  # audio duration in seconds
    sample_rate: int  # sample rate in Hz
    channels: int  # number of audio channels
    num_samples: int  # number of samples
254
+
255
+
256
+ # --------------------------------------------------------------------------------------
257
+ # Computer Vision structures
258
+ # --------------------------------------------------------------------------------------
259
+
260
@dataclass
class BoundingBox:
    """Generic bounding box structure."""
    x: float  # X coordinate (normalized or pixel, depends on model)
    y: float  # Y coordinate (normalized or pixel, depends on model)
    width: float  # box width
    height: float  # box height
267
+
268
+
269
@dataclass
class CVResult:
    """Generic detection/classification result.

    Fields are populated per task (see per-field examples); unused ones keep
    their defaults.
    """
    image_paths: Optional[List[Path]] = None  # Output image paths
    image_count: int = 0  # Number of output images
    class_id: int = 0  # Class ID (example: ConvNext)
    confidence: float = 0.0  # Confidence score [0.0-1.0]
    bbox: Optional[BoundingBox] = None  # Bounding box (example: YOLO)
    text: Optional[str] = None  # Text result (example: OCR)
    embedding: Optional[List[float]] = None  # Feature embedding (example: CLIP embedding)
    embedding_dim: int = 0  # Embedding dimension
280
+
281
+
282
@dataclass
class CVResults:
    """Generic CV inference result."""
    results: List[CVResult]  # Array of CV results
    result_count: int  # Number of CV results
287
+
288
+
289
class CVCapabilities:
    """CV capabilities enum.

    NOTE(review): plain int class attributes, not enum.Enum — values are
    passed to the C bridge as ints.
    """
    OCR = 0  # OCR
    CLASSIFICATION = 1  # Classification
    SEGMENTATION = 2  # Segmentation
    CUSTOM = 3  # Custom task
295
+
296
+
297
@dataclass
class CVModelConfig:
    """CV model preprocessing configuration."""
    capabilities: int  # one of the CVCapabilities constants

    # MLX-OCR
    det_model_path: Optional[str] = None  # Detection model path
    rec_model_path: Optional[str] = None  # Recognition model path

    # QNN
    model_path: Optional[str] = None  # Model path
    system_library_path: Optional[str] = None  # System library path
    backend_library_path: Optional[str] = None  # Backend library path
    extension_library_path: Optional[str] = None  # Extension library path
    config_file_path: Optional[str] = None  # Config file path
    char_dict_path: Optional[str] = None  # Character dictionary path
313
+
314
+
315
+ # --------------------------------------------------------------------------------------
316
+ # LLM
317
+ # --------------------------------------------------------------------------------------
318
+
319
class LLM(ABC):
    """Interface contract for Large Language Model backends."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        config: ModelConfig,
        device: Optional[str] = None,
    ) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.config = config
        self.device = device

    # -- lifecycle ---------------------------------------------------------

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def reset(self) -> None:
        """Clear the model's internal state."""

    # -- tokenization ------------------------------------------------------

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Convert *text* into a list of token IDs."""

    @abstractmethod
    def decode(self, token_ids: Sequence[int]) -> str:
        """Convert *token_ids* back into text."""

    # -- KV-cache ----------------------------------------------------------

    @abstractmethod
    def save_kv_cache(self, path: Path) -> bool:
        """Persist the KV cache to *path*; True on success."""

    @abstractmethod
    def load_kv_cache(self, path: Path) -> bool:
        """Restore the KV cache from *path*; True on success."""

    # -- LoRA adapters -----------------------------------------------------

    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Activate the LoRA adapter identified by *lora_id*."""

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Register a LoRA adapter from *lora_path*; returns its ID."""

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Unregister the LoRA adapter identified by *lora_id*."""

    @abstractmethod
    def list_loras(self) -> List[int]:
        """Return the IDs of all registered LoRA adapters."""

    # -- sampling ----------------------------------------------------------

    @abstractmethod
    def set_sampler(self, config: SamplerConfig) -> None:
        """Install *config* as the active sampler configuration."""

    @abstractmethod
    def reset_sampler(self) -> None:
        """Restore the default sampler configuration."""

    # -- generation --------------------------------------------------------

    @abstractmethod
    def generate_stream(
        self,
        prompt: str,
        config: Optional[GenerationConfig],
        on_token: TokenCallback,
        user_data: Any = None,
    ) -> str:
        """Generate from *prompt*, invoking *on_token* as tokens are produced;
        returns the generated text."""

    @abstractmethod
    def get_chat_template(self, template_name: str) -> str:
        """Return the chat template registered under *template_name*."""

    @abstractmethod
    def apply_chat_template(self, messages: Sequence[ChatMessage], tools: Optional[Sequence[Tool]] = None, enable_thinking: bool = True) -> str:
        """Render *messages* (and optional *tools*) into a single prompt string."""

    # -- embeddings --------------------------------------------------------

    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Return one embedding vector per entry in *texts*."""
428
+
429
+
430
+ # --------------------------------------------------------------------------------------
431
+ # VLM (Vision-Language Model)
432
+ # --------------------------------------------------------------------------------------
433
+
434
class VLM(ABC):
    """Interface contract for Vision-Language Model backends."""

    def __init__(
        self,
        model_path: Path,
        mmproj_path: Path,
        context_length: int,
        device: Optional[str] = None,
    ) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.model_path = model_path
        self.mmproj_path = mmproj_path
        self.context_length = context_length
        self.device = device

    # -- lifecycle ---------------------------------------------------------

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def reset(self) -> None:
        """Clear the model's internal state."""

    # -- tokenization ------------------------------------------------------

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Convert *text* into a list of token IDs."""

    @abstractmethod
    def decode(self, token_ids: Sequence[int]) -> str:
        """Convert *token_ids* back into text."""

    # -- sampling ----------------------------------------------------------

    @abstractmethod
    def set_sampler(self, config: SamplerConfig) -> None:
        """Install *config* as the active sampler configuration."""

    @abstractmethod
    def reset_sampler(self) -> None:
        """Restore the default sampler configuration."""

    # -- generation --------------------------------------------------------

    @abstractmethod
    def generate(
        self,
        prompt: str,
        config: Optional[GenerationConfig] = None,
    ) -> str:
        """Generate text from a text-only *prompt*."""

    @abstractmethod
    def generate_multimodal(
        self,
        prompt: str,
        image_paths: Optional[Sequence[Path]] = None,
        audio_paths: Optional[Sequence[Path]] = None,
        config: Optional[GenerationConfig] = None,
    ) -> str:
        """Generate text from *prompt* plus optional image and audio files."""

    @abstractmethod
    def generate_stream(
        self,
        prompt: str,
        config: Optional[GenerationConfig],
        on_token: TokenCallback,
        user_data: Any = None,
    ) -> str:
        """Generate from *prompt*, invoking *on_token* as tokens are produced;
        returns the generated text."""

    @abstractmethod
    def generate_stream_multimodal(
        self,
        prompt: str,
        image_paths: Optional[Sequence[Path]] = None,
        audio_paths: Optional[Sequence[Path]] = None,
        config: Optional[GenerationConfig] = None,
        on_token: Optional[TokenCallback] = None,
        user_data: Any = None,
    ) -> str:
        """Streaming variant of generate_multimodal."""

    # -- chat templates ----------------------------------------------------

    @abstractmethod
    def get_chat_template(self, template_name: str) -> str:
        """Return the chat template registered under *template_name*."""

    @abstractmethod
    def apply_chat_template(self, messages: Sequence[ChatMessage], tools: Optional[Sequence[Tool]] = None, enable_thinking: bool = True) -> str:
        """Render *messages* (and optional *tools*) into a single prompt string."""

    # -- embeddings --------------------------------------------------------

    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Return one embedding vector per entry in *texts*."""
545
+
546
+
547
+ # --------------------------------------------------------------------------------------
548
+ # Embedding Model
549
+ # --------------------------------------------------------------------------------------
550
+
551
class Embedder(ABC):
    """Interface contract for embedding-model backends."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        device: Optional[str] = None,
    ) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = device

    # -- lifecycle ---------------------------------------------------------

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load the model from *model_path*; True on success."""

    @abstractmethod
    def close(self) -> None:
        """Close the model."""

    # -- embeddings --------------------------------------------------------

    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Return one embedding vector per entry in *texts*."""

    @abstractmethod
    def embedding_dim(self) -> int:
        """Return the dimensionality of the embedding vectors."""

    # -- LoRA adapters -----------------------------------------------------

    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Activate the LoRA adapter identified by *lora_id*."""

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Register a LoRA adapter from *lora_path*; returns its ID."""

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Unregister the LoRA adapter identified by *lora_id*."""

    @abstractmethod
    def list_loras(self) -> List[int]:
        """Return the IDs of all registered LoRA adapters."""
612
+
613
+
614
+ # --------------------------------------------------------------------------------------
615
+ # Reranker Model
616
+ # --------------------------------------------------------------------------------------
617
+
618
class Reranker(ABC):
    """Interface contract for reranker backends."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        device: Optional[str] = None,
    ) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load the model from *model_path*; True on success."""

    @abstractmethod
    def close(self) -> None:
        """Close the model."""

    @abstractmethod
    def rerank(
        self,
        query: str,
        documents: Sequence[str],
        config: Optional[RerankConfig] = None,
    ) -> List[float]:
        """Score each entry of *documents* for relevance to *query*."""
655
+
656
+
657
+ # --------------------------------------------------------------------------------------
658
+ # Image generation
659
+ # --------------------------------------------------------------------------------------
660
+
661
class ImageGen(ABC):
    """Interface contract for image-generation backends."""

    def __init__(
        self,
        model_path: Path,
        scheduler_config_path: Path,
        device: Optional[str] = None,
    ) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.model_path = model_path
        self.scheduler_config_path = scheduler_config_path
        self.device = device

    # -- lifecycle ---------------------------------------------------------

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load the model from *model_path*; True on success."""

    @abstractmethod
    def close(self) -> None:
        """Close the model."""

    # -- scheduler / sampler -----------------------------------------------

    @abstractmethod
    def set_scheduler(self, config: SchedulerConfig) -> None:
        """Install *config* as the active diffusion scheduler."""

    @abstractmethod
    def set_sampler(self, config: ImageSamplerConfig) -> None:
        """Install *config* as the active sampler configuration."""

    @abstractmethod
    def reset_sampler(self) -> None:
        """Restore the default sampler configuration."""

    # -- generation --------------------------------------------------------

    @abstractmethod
    def txt2img(self, prompt: str, config: ImageGenerationConfig) -> Image:
        """Render an image from a text *prompt*."""

    @abstractmethod
    def img2img(self, init_image: Image, prompt: str, config: ImageGenerationConfig) -> Image:
        """Render an image conditioned on *init_image* and *prompt*."""

    @abstractmethod
    def generate(self, config: ImageGenerationConfig) -> Image:
        """Render an image entirely from *config* (prompts live inside it)."""

    # -- LoRA adapters -----------------------------------------------------

    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Activate the LoRA adapter identified by *lora_id*."""

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Register a LoRA adapter from *lora_path*; returns its ID."""

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Unregister the LoRA adapter identified by *lora_id*."""

    @abstractmethod
    def list_loras(self) -> List[int]:
        """Return the IDs of all registered LoRA adapters."""
738
+
739
+
740
+ # --------------------------------------------------------------------------------------
741
+ # Computer vision – Generic CV Model
742
+ # --------------------------------------------------------------------------------------
743
+
744
class CVModel(ABC):
    """Interface contract for generic computer-vision backends."""

    def __init__(self, config: CVModelConfig, device: Optional[str] = None) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.config = config
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def infer(self, input_image_path: str) -> CVResults:
        """Run inference on the image at *input_image_path*."""
760
+
761
+
762
+ # --------------------------------------------------------------------------------------
763
+ # Speech recognition – ASR
764
+ # --------------------------------------------------------------------------------------
765
+
766
class ASR(ABC):
    """Interface contract for Automatic Speech Recognition backends."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Optional[Path],
        language: Optional[str],
        device: Optional[str] = None,
    ) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.language = language
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def close(self) -> None:
        """Close the model."""

    @abstractmethod
    def transcribe(
        self,
        audio_path: Path,
        language: Optional[str] = None,
        config: Optional[ASRConfig] = None,
    ) -> ASRResult:
        """Transcribe the audio file at *audio_path* into an ASRResult."""

    @abstractmethod
    def list_supported_languages(self) -> List[str]:
        """Return the language codes this model supports."""
805
+
806
+
807
+ # --------------------------------------------------------------------------------------
808
+ # Speech synthesis – TTS
809
+ # --------------------------------------------------------------------------------------
810
+
811
class TTS(ABC):
    """Interface contract for Text-to-Speech backends."""

    def __init__(
        self,
        model_path: Path,
        vocoder_path: Path,
        device: Optional[str] = None,
    ) -> None:
        # Record construction arguments; concrete backends do the real loading.
        self.model_path = model_path
        self.vocoder_path = vocoder_path
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Release every native resource owned by this model."""

    @abstractmethod
    def synthesize(
        self,
        text: str,
        config: Optional[TTSConfig] = None,
        output_path: Optional[Path] = None,
    ) -> TTSResult:
        """Synthesize speech for *text* and write the audio to the filesystem."""

    @abstractmethod
    def list_available_voices(self) -> List[str]:
        """Return the names of the available voices."""