nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (580)
  1. nexaai/__init__.py +99 -0
  2. nexaai/_stub.cpython-310-darwin.so +0 -0
  3. nexaai/_version.py +4 -0
  4. nexaai/asr.py +68 -0
  5. nexaai/asr_impl/__init__.py +0 -0
  6. nexaai/asr_impl/mlx_asr_impl.py +93 -0
  7. nexaai/asr_impl/pybind_asr_impl.py +127 -0
  8. nexaai/base.py +39 -0
  9. nexaai/binds/__init__.py +7 -0
  10. nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  12. nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
  13. nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
  14. nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
  15. nexaai/binds/cpu_gpu/libggml.dylib +0 -0
  16. nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
  17. nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
  18. nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
  19. nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
  20. nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
  21. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  22. nexaai/binds/libnexa_bridge.dylib +0 -0
  23. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  24. nexaai/binds/metal/libnexa_plugin.dylib +0 -0
  25. nexaai/binds/metal/py-lib/ml.py +888 -0
  26. nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
  27. nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
  28. nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  29. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  30. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  31. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  32. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  33. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  34. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  35. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  36. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  37. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  38. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  39. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  40. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  41. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  42. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  43. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  44. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  45. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  46. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  47. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  48. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  49. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  50. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  51. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  52. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  53. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  54. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  55. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  56. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  57. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  58. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  59. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  60. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  61. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  62. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  63. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  64. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  65. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  66. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  67. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  68. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  69. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  70. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  71. nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
  72. nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
  73. nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  74. nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  75. nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
  76. nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
  77. nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  78. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  79. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  80. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  81. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  82. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  83. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  84. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  85. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  86. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  87. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  88. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  89. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  90. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  91. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  92. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  93. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  94. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  95. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  96. nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  97. nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
  98. nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
  99. nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
  100. nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
  101. nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
  102. nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  103. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  104. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  105. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  106. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  107. nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
  108. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  109. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  110. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  111. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  112. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  113. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  114. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  115. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  116. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  117. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  118. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  119. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  120. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  121. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  122. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  123. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  124. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  125. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  126. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  127. nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  128. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  129. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  130. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  131. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  132. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  133. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  134. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  135. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  136. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  137. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  138. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  139. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  140. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  141. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  142. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  143. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  144. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  145. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  146. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  147. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  148. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  149. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  150. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  151. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  152. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  153. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  154. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  155. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  156. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  157. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  158. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  159. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  160. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  161. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  162. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  163. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  164. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  165. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  166. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  167. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  168. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  169. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  170. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  171. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  172. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  173. nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
  174. nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
  175. nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
  176. nexaai/binds/metal/py-lib/profiling.py +239 -0
  177. nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
  178. nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
  179. nexaai/binds/nexaml/libggml-base.dylib +0 -0
  180. nexaai/binds/nexaml/libggml-cpu.so +0 -0
  181. nexaai/binds/nexaml/libggml-metal.so +0 -0
  182. nexaai/binds/nexaml/libggml.dylib +0 -0
  183. nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
  184. nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
  185. nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
  186. nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
  187. nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
  188. nexaai/binds/nexaml/libnexaproc.dylib +0 -0
  189. nexaai/binds/nexaml/libomp.dylib +0 -0
  190. nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
  191. nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
  192. nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
  193. nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
  194. nexaai/common.py +106 -0
  195. nexaai/cv.py +95 -0
  196. nexaai/cv_impl/__init__.py +0 -0
  197. nexaai/cv_impl/mlx_cv_impl.py +91 -0
  198. nexaai/cv_impl/pybind_cv_impl.py +124 -0
  199. nexaai/diarize.py +80 -0
  200. nexaai/diarize_impl/__init__.py +1 -0
  201. nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
  202. nexaai/embedder.py +73 -0
  203. nexaai/embedder_impl/__init__.py +0 -0
  204. nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
  205. nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
  206. nexaai/image_gen.py +141 -0
  207. nexaai/image_gen_impl/__init__.py +0 -0
  208. nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
  209. nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
  210. nexaai/llm.py +98 -0
  211. nexaai/llm_impl/__init__.py +0 -0
  212. nexaai/llm_impl/mlx_llm_impl.py +271 -0
  213. nexaai/llm_impl/pybind_llm_impl.py +238 -0
  214. nexaai/log.py +92 -0
  215. nexaai/mlx_backend/asr/__init__.py +12 -0
  216. nexaai/mlx_backend/asr/interface.py +122 -0
  217. nexaai/mlx_backend/common/__init__.py +0 -0
  218. nexaai/mlx_backend/common/utils.py +25 -0
  219. nexaai/mlx_backend/cv/__init__.py +0 -0
  220. nexaai/mlx_backend/cv/generate.py +195 -0
  221. nexaai/mlx_backend/cv/interface.py +162 -0
  222. nexaai/mlx_backend/cv/main.py +81 -0
  223. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  224. nexaai/mlx_backend/embedding/__init__.py +0 -0
  225. nexaai/mlx_backend/embedding/generate.py +333 -0
  226. nexaai/mlx_backend/embedding/interface.py +617 -0
  227. nexaai/mlx_backend/embedding/main.py +173 -0
  228. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  229. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  230. nexaai/mlx_backend/image_gen/__init__.py +1 -0
  231. nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
  232. nexaai/mlx_backend/image_gen/interface.py +82 -0
  233. nexaai/mlx_backend/image_gen/main.py +281 -0
  234. nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
  235. nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
  236. nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
  237. nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
  238. nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
  239. nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
  240. nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
  241. nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
  242. nexaai/mlx_backend/llm/__init__.py +0 -0
  243. nexaai/mlx_backend/llm/generate.py +149 -0
  244. nexaai/mlx_backend/llm/interface.py +764 -0
  245. nexaai/mlx_backend/llm/main.py +68 -0
  246. nexaai/mlx_backend/ml.py +888 -0
  247. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  249. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  250. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  251. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  252. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  253. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  254. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  255. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  256. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  257. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  258. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  259. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  260. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  261. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  262. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  264. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  265. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  266. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  267. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  268. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  269. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  270. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  271. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  272. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  273. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  274. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  275. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  276. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  277. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  278. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  279. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  280. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  281. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  282. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  283. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  284. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  285. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  286. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  287. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  288. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  289. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  290. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  291. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  292. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  293. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  294. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  295. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  296. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  297. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  298. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  299. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  300. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  301. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  302. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  303. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  304. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  305. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  306. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  307. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  308. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  309. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  310. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  311. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  312. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  313. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  314. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  315. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  316. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  317. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  318. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  319. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  320. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  321. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  322. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  353. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  354. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  355. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  356. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  357. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  358. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  359. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  360. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  361. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  362. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  363. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  364. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  365. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  366. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  367. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  368. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  369. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  370. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  371. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  372. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  373. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  374. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  375. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  376. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  377. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  378. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  379. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  380. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  381. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  382. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  383. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  384. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  385. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  386. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  387. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  388. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  389. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  390. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  391. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  392. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  393. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  394. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  395. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  396. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  397. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  398. nexaai/mlx_backend/profiling.py +239 -0
  399. nexaai/mlx_backend/rerank/__init__.py +0 -0
  400. nexaai/mlx_backend/rerank/generate.py +174 -0
  401. nexaai/mlx_backend/rerank/interface.py +287 -0
  402. nexaai/mlx_backend/rerank/main.py +127 -0
  403. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  404. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  405. nexaai/mlx_backend/sd/__init__.py +1 -0
  406. nexaai/mlx_backend/sd/interface.py +362 -0
  407. nexaai/mlx_backend/sd/main.py +286 -0
  408. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  409. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  410. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  411. nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
  412. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  413. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  414. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  415. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  416. nexaai/mlx_backend/tts/__init__.py +12 -0
  417. nexaai/mlx_backend/tts/interface.py +276 -0
  418. nexaai/mlx_backend/vlm/__init__.py +3 -0
  419. nexaai/mlx_backend/vlm/generate.py +572 -0
  420. nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
  421. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
  422. nexaai/mlx_backend/vlm/interface.py +559 -0
  423. nexaai/mlx_backend/vlm/main.py +365 -0
  424. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  425. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  426. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  427. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  428. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  429. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  430. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  431. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  432. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  433. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  434. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  435. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  436. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  437. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  438. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  439. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  440. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  441. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  442. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  443. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  444. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  445. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  446. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  447. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  448. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  449. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  450. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  451. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  452. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  453. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  454. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  455. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  456. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  457. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  458. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  459. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  460. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  461. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  462. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  463. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  464. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  465. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  466. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  467. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  468. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  470. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  471. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  472. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  473. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  474. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  475. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  476. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  477. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  478. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  479. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  480. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  481. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  482. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  483. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  484. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  485. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  486. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  487. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  488. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  489. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  490. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  491. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  492. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  493. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  494. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  495. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  496. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  497. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  498. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  499. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  500. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  501. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  502. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  503. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  504. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  505. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  506. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  507. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  508. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  509. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  510. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  511. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  512. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  513. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  514. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  515. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  516. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  517. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  518. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  519. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  520. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  521. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  522. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  523. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  524. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  525. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  526. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
  527. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
  528. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
  529. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
  530. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
  531. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
  532. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
  533. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
  534. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
  535. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  536. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  537. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  538. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  539. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  540. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  541. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  542. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  543. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
  544. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  545. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  546. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  547. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  548. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  549. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  550. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  551. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  552. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  553. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  554. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  555. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  556. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  557. nexaai/rerank.py +57 -0
  558. nexaai/rerank_impl/__init__.py +0 -0
  559. nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
  560. nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
  561. nexaai/runtime.py +68 -0
  562. nexaai/runtime_error.py +24 -0
  563. nexaai/tts.py +75 -0
  564. nexaai/tts_impl/__init__.py +0 -0
  565. nexaai/tts_impl/mlx_tts_impl.py +94 -0
  566. nexaai/tts_impl/pybind_tts_impl.py +43 -0
  567. nexaai/utils/decode.py +18 -0
  568. nexaai/utils/manifest_utils.py +531 -0
  569. nexaai/utils/model_manager.py +1745 -0
  570. nexaai/utils/model_types.py +49 -0
  571. nexaai/utils/progress_tracker.py +389 -0
  572. nexaai/utils/quantization_utils.py +245 -0
  573. nexaai/vlm.py +130 -0
  574. nexaai/vlm_impl/__init__.py +0 -0
  575. nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
  576. nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
  577. nexaai-1.0.29.dist-info/METADATA +35 -0
  578. nexaai-1.0.29.dist-info/RECORD +580 -0
  579. nexaai-1.0.29.dist-info/WHEEL +5 -0
  580. nexaai-1.0.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,888 @@
1
+ # This file defines the python interface that c-lib expects from a python backend
2
+
3
+ from __future__ import annotations
4
+ from typing import Optional
5
+ from pathlib import Path
6
+ from dataclasses import dataclass
7
+
8
+ from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass, field
10
+ from typing import (
11
+ Any,
12
+ Callable,
13
+ List,
14
+ Optional,
15
+ Protocol,
16
+ Sequence,
17
+ Tuple,
18
+ TypedDict,
19
+ )
20
+
21
# --------------------------------------------------------------------------------------
# Core aliases & callback protocols
# --------------------------------------------------------------------------------------

# NOTE(review): this rebinds the name `Path` (imported from pathlib above) to
# plain `str` — every *_path parameter in this interface is a string, not a
# pathlib.Path object. The pathlib import is effectively shadowed.
Path = str

# Callback that receives a single log message line.
LogCallback = Callable[[str], None]


class TokenCallback(Protocol):
    # Invoked once per generated token together with the caller-supplied
    # user_data. The bool return presumably signals whether generation should
    # continue — TODO confirm against the c-lib bridge contract.
    def __call__(self, token: str, user_data: Any) -> bool: ...
32
+
33
+
34
+ # --------------------------------------------------------------------------------------
35
+ # Core module functions
36
+ # --------------------------------------------------------------------------------------
37
+
38
def init() -> None:
    """Initialize the ML module (placeholder hook for the c-lib bridge)."""
    return None
41
+
42
+
43
def deinit() -> None:
    """Deinitialize the ML module (placeholder hook for the c-lib bridge)."""
    return None
46
+
47
+
48
def set_log(callback: LogCallback) -> None:
    """Register *callback* as the logging sink (placeholder; no-op here)."""
    return None
51
+
52
+
53
def log(message: str) -> None:
    """Emit *message* through the logging hook (placeholder; no-op here)."""
    return None
56
+
57
+
58
def model_config_default() -> ModelConfig:
    """Return a fresh ModelConfig carrying the library's default values."""
    defaults = ModelConfig()
    return defaults
61
+
62
+
63
+ # --------------------------------------------------------------------------------------
64
+ # Basic data structures
65
+ # --------------------------------------------------------------------------------------
66
+
67
@dataclass
class Image:
    """Image data structure: flat float pixel buffer plus its dimensions."""
    data: List[float]  # width × height × channels
    width: int
    height: int
    channels: int  # 3 = RGB, 4 = RGBA


@dataclass
class Audio:
    """Audio data structure: flat float sample buffer plus metadata."""
    data: List[float]  # num_samples × channels
    sample_rate: int  # samples per second
    channels: int
    num_samples: int


@dataclass
class Video:
    """Video data structure: flat float frame buffer plus its dimensions."""
    data: List[float]  # width × height × channels × num_frames
    width: int
    height: int
    channels: int
    num_frames: int
93
+
94
+
95
+ # --------------------------------------------------------------------------------------
96
+ # Language-model structures
97
+ # --------------------------------------------------------------------------------------
98
+
99
@dataclass
class ModelConfig:
    """Configuration for model parameters (mirrors the llama.cpp-style knobs)."""
    n_ctx: int = 0  # text context, 0 = from model
    n_threads: int = 0  # number of threads to use for generation
    n_threads_batch: int = 0  # number of threads to use for batch processing
    n_batch: int = 0  # logical maximum batch size that can be submitted to llama_decode
    n_ubatch: int = 0  # physical maximum batch size
    # max number of sequences (i.e. distinct states for recurrent models)
    n_seq_max: int = 0
    # path to chat template file, optional
    chat_template_path: Optional[Path] = None
    # content of chat template file, optional
    chat_template_content: Optional[str] = None


@dataclass
class SamplerConfig:
    """Configuration for text sampling."""
    temperature: float = 0.7
    top_p: float = 0.9
    top_k: int = 40
    min_p: float = 0.0  # Minimum probability for nucleus sampling
    repetition_penalty: float = 1.0
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    seed: int = -1  # -1 for random
    grammar_path: Optional[Path] = None
    # Optional grammar string (BNF-like format)
    grammar_string: Optional[str] = None


@dataclass
class GenerationConfig:
    """Configuration for text generation."""
    max_tokens: int = 512
    stop: Sequence[str] = field(default_factory=tuple)  # stop strings; empty = none
    n_past: int = 0
    sampler_config: Optional[SamplerConfig] = None
    # Array of image paths for VLM (None if none)
    image_paths: Optional[Sequence[Path]] = None
    # Array of audio paths for VLM (None if none)
    audio_paths: Optional[Sequence[Path]] = None


@dataclass
class ChatMessage:
    """A chat message with role and content."""
    role: str  # "user" | "assistant" | "system"
    content: str


# OpenAI-style tool/function descriptors passed to apply_chat_template.
class ToolFunction(TypedDict):
    name: str
    description: str
    parameters_json: str  # JSON-encoded parameter schema


class Tool(TypedDict):
    type: str  # e.g. "function"
    function: ToolFunction
160
+
161
+
162
+ # --------------------------------------------------------------------------------------
163
+ # Embedding / rerank / diffusion / OCR / ASR / TTS utilities
164
+ # --------------------------------------------------------------------------------------
165
+
166
@dataclass
class EmbeddingConfig:
    """Configuration for embeddings."""
    batch_size: int = 1
    normalize: bool = True
    normalize_method: str = "l2"  # "l2" | "mean" | "none"


@dataclass
class RerankConfig:
    """Configuration for reranking."""
    batch_size: int = 1
    normalize: bool = True  # whether to normalize the returned scores
    normalize_method: str = "softmax"  # "softmax" | "min-max" | "none"
180
+
181
+
182
# image-gen


# NOTE(review): the input dataclasses below reference ImageGenerationConfig /
# SchedulerConfig before those classes are defined; this is safe because
# `from __future__ import annotations` makes all annotations lazy strings.
@dataclass
class ImageGenTxt2ImgInput:
    """Input structure for text-to-image generation."""
    prompt: str
    config: ImageGenerationConfig
    output_path: Optional[Path] = None


@dataclass
class ImageGenImg2ImgInput:
    """Input structure for image-to-image generation."""
    init_image_path: Path
    prompt: str
    config: ImageGenerationConfig
    output_path: Optional[Path] = None


@dataclass
class ImageGenOutput:
    """Output structure for image generation."""
    output_image_path: Path


@dataclass
class ImageSamplerConfig:
    """Configuration for image sampling."""
    method: str = "ddim"
    steps: int = 20
    guidance_scale: float = 7.5
    eta: float = 0.0
    seed: int = -1  # -1 for random


@dataclass
class ImageGenCreateInput:
    """Creation parameters for an image-generation model instance."""
    model_name: str
    model_path: Path
    config: ModelConfig
    scheduler_config_path: Path
    plugin_id: str
    device_id: Optional[str] = None


@dataclass
class ImageGenerationConfig:
    """Per-request configuration for image generation."""
    prompts: List[str]
    sampler_config: ImageSamplerConfig
    scheduler_config: SchedulerConfig
    strength: float  # img2img strength — presumably [0, 1]; TODO confirm
    negative_prompts: Optional[List[str]] = None
    height: int = 512
    width: int = 512


@dataclass
class SchedulerConfig:
    """Configuration for diffusion scheduler."""
    type: str = "ddim"
    num_train_timesteps: int = 1000
    steps_offset: int = 0  # An offset added to the inference steps
    beta_start: float = 0.00085
    beta_end: float = 0.012
    beta_schedule: str = "scaled_linear"
    prediction_type: str = "epsilon"
    timestep_type: str = "discrete"
    timestep_spacing: str = "linspace"
    interpolation_type: str = "linear"
    config_path: Optional[Path] = None
255
+
256
+
257
@dataclass
class ASRConfig:
    """Configuration for ASR."""
    timestamps: str = "none"  # "none" | "segment" | "word"
    beam_size: int = 5
    stream: bool = False


@dataclass
class ASRResult:
    """Result from ASR processing."""
    transcript: str
    confidence_scores: Sequence[float]
    timestamps: Sequence[Tuple[float, float]]  # (start, end) pairs — presumably seconds; TODO confirm
    duration_us: float  # duration — suffix suggests microseconds; TODO confirm


@dataclass
class TTSConfig:
    """Configuration for TTS."""
    voice: str = "default"
    speed: float = 1.0  # playback-rate multiplier
    seed: int = -1  # -1 for random
    sample_rate: int = 22050


@dataclass
class TTSSamplerConfig:
    """Configuration for TTS sampling."""
    temperature: float = 1.0
    noise_scale: float = 0.667
    length_scale: float = 1.0


@dataclass
class TTSResult:
    """Result from TTS processing."""
    audio_path: str  # Path where the synthesized audio is saved
    duration_seconds: float
    sample_rate: int
    channels: int
    num_samples: int
299
+
300
+
301
+ # --------------------------------------------------------------------------------------
302
+ # Computer Vision structures
303
+ # --------------------------------------------------------------------------------------
304
+
305
@dataclass
class BoundingBox:
    """Generic bounding box structure."""
    x: float  # X coordinate (normalized or pixel, depends on model)
    y: float  # Y coordinate (normalized or pixel, depends on model)
    width: float  # Width
    height: float  # Height


@dataclass
class CVResult:
    """Generic detection/classification result."""
    image_paths: Optional[List[Path]] = None  # Output image paths
    image_count: int = 0  # Number of output images
    class_id: int = 0  # Class ID (example: ConvNext)
    confidence: float = 0.0  # Confidence score [0.0-1.0]
    bbox: Optional[BoundingBox] = None  # Bounding box (example: YOLO)
    text: Optional[str] = None  # Text result (example: OCR)
    # Feature embedding (example: CLIP embedding)
    embedding: Optional[List[float]] = None
    embedding_dim: int = 0  # Embedding dimension


@dataclass
class CVResults:
    """Generic CV inference result."""
    results: List[CVResult]  # Array of CV results
    result_count: int  # Number of CV results


class CVCapabilities:
    """CV capabilities enum."""
    # NOTE(review): plain int class attributes rather than enum.Enum —
    # presumably so values cross the pybind layer as bare ints; confirm
    # before converting to a real Enum.
    OCR = 0  # OCR
    CLASSIFICATION = 1  # Classification
    SEGMENTATION = 2  # Segmentation
    CUSTOM = 3  # Custom task


@dataclass
class CVModelConfig:
    """CV model preprocessing configuration."""
    capabilities: int  # CVCapabilities

    # MLX-OCR
    det_model_path: Optional[str] = None  # Detection model path
    rec_model_path: Optional[str] = None  # Recognition model path

    # QNN
    model_path: Optional[str] = None  # Model path
    system_library_path: Optional[str] = None  # System library path
    backend_library_path: Optional[str] = None  # Backend library path
    extension_library_path: Optional[str] = None  # Extension library path
    config_file_path: Optional[str] = None  # Config file path
    char_dict_path: Optional[str] = None  # Character dictionary path
359
+
360
+
361
+ # --------------------------------------------------------------------------------------
362
+ # LLM
363
+ # --------------------------------------------------------------------------------------
364
+
365
class LLM(ABC):
    """Abstract base class for Large Language Models.

    Concrete backends implement tokenization, KV-cache persistence, LoRA
    management, sampler configuration, streaming generation, chat templating,
    and embeddings. All *_path values are plain strings (see `Path = str`).
    """

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        config: ModelConfig,
        device: Optional[str] = None,
    ) -> None:
        # Only stores the construction parameters; subclasses do the loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.config = config
        self.device = device  # None = backend-chosen default device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def reset(self) -> None:
        """Reset the model state."""
        pass

    # Tokenization
    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        pass

    @abstractmethod
    def decode(self, token_ids: Sequence[int]) -> str:
        """Decode token IDs to text."""
        pass

    # KV-cache
    @abstractmethod
    def save_kv_cache(self, path: Path) -> bool:
        """Save KV cache to file."""
        pass

    @abstractmethod
    def load_kv_cache(self, path: Path) -> bool:
        """Load KV cache from file."""
        pass

    # LoRA
    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Set active LoRA adapter."""
        pass

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Add LoRA adapter and return its ID."""
        pass

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Remove LoRA adapter."""
        pass

    @abstractmethod
    def list_loras(self) -> List[int]:
        """List available LoRA adapters."""
        pass

    # Sampler
    @abstractmethod
    def set_sampler(self, config: SamplerConfig) -> None:
        """Set sampler configuration."""
        pass

    @abstractmethod
    def reset_sampler(self) -> None:
        """Reset sampler to default configuration."""
        pass

    @abstractmethod
    def generate_stream(
        self,
        prompt: str,
        config: Optional[GenerationConfig],
        on_token: TokenCallback,
        user_data: Any = None,
    ) -> str:
        """Generate text with streaming callback."""
        pass

    @abstractmethod
    def get_chat_template(self, template_name: str) -> str:
        """Get chat template by name."""
        pass

    @abstractmethod
    def apply_chat_template(self, messages: Sequence[ChatMessage], tools: Optional[Sequence[Tool]] = None, enable_thinking: bool = True) -> str:
        """Apply chat template to messages with optional tools support."""
        pass

    # Embeddings
    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Generate embeddings for texts."""
        pass
474
+
475
+
476
+ # --------------------------------------------------------------------------------------
477
+ # VLM (Vision-Language Model)
478
+ # --------------------------------------------------------------------------------------
479
+
480
class VLM(ABC):
    """Abstract base class for Vision-Language Models.

    Extends the plain-text generation surface with multimodal variants that
    accept image and audio file paths alongside the prompt.
    """

    def __init__(
        self,
        model_path: Path,
        mmproj_path: Path,
        context_length: int,
        device: Optional[str] = None,
    ) -> None:
        # Only stores the construction parameters; subclasses do the loading.
        self.model_path = model_path
        self.mmproj_path = mmproj_path  # multimodal projector file (per the "mmproj" naming)
        self.context_length = context_length
        self.device = device  # None = backend-chosen default device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def reset(self) -> None:
        """Reset the model state."""
        pass

    # Tokenization
    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        pass

    @abstractmethod
    def decode(self, token_ids: Sequence[int]) -> str:
        """Decode token IDs to text."""
        pass

    # Sampler
    @abstractmethod
    def set_sampler(self, config: SamplerConfig) -> None:
        """Set sampler configuration."""
        pass

    @abstractmethod
    def reset_sampler(self) -> None:
        """Reset sampler to default configuration."""
        pass

    # Generation
    @abstractmethod
    def generate(
        self,
        prompt: str,
        config: Optional[GenerationConfig] = None,
    ) -> str:
        """Generate text from prompt."""
        pass

    @abstractmethod
    def generate_multimodal(
        self,
        prompt: str,
        image_paths: Optional[Sequence[Path]] = None,
        audio_paths: Optional[Sequence[Path]] = None,
        config: Optional[GenerationConfig] = None,
    ) -> str:
        """Generate text from prompt with multiple images and audio."""
        pass

    @abstractmethod
    def generate_stream(
        self,
        prompt: str,
        config: Optional[GenerationConfig],
        on_token: TokenCallback,
        user_data: Any = None,
    ) -> str:
        """Generate text with streaming callback."""
        pass

    @abstractmethod
    def generate_stream_multimodal(
        self,
        prompt: str,
        image_paths: Optional[Sequence[Path]] = None,
        audio_paths: Optional[Sequence[Path]] = None,
        config: Optional[GenerationConfig] = None,
        on_token: Optional[TokenCallback] = None,
        user_data: Any = None,
    ) -> str:
        """Generate text from prompt with multiple images and audio using streaming callback."""
        pass

    @abstractmethod
    def get_chat_template(self, template_name: str) -> str:
        """Get chat template by name."""
        pass

    @abstractmethod
    def apply_chat_template(self, messages: Sequence[ChatMessage], tools: Optional[Sequence[Tool]] = None, enable_thinking: bool = True) -> str:
        """Apply chat template to messages with optional tools support."""
        pass

    # Embeddings
    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Generate embeddings for texts."""
        pass
591
+
592
+
593
+ # --------------------------------------------------------------------------------------
594
+ # Embedding Model
595
+ # --------------------------------------------------------------------------------------
596
+
597
class Embedder(ABC):
    """Abstract base class for embedding models.

    Lifecycle: construct (stores paths only), load_model, embed, close/destroy.
    """

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        device: Optional[str] = None,
    ) -> None:
        # Only stores the construction parameters; subclasses do the loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = device  # None = backend-chosen default device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load model from path."""
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the model."""
        pass

    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Generate embeddings for texts."""
        pass

    @abstractmethod
    def embedding_dim(self) -> int:
        """Get embedding dimension."""
        pass

    # LoRA
    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Set active LoRA adapter."""
        pass

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Add LoRA adapter and return its ID."""
        pass

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Remove LoRA adapter."""
        pass

    @abstractmethod
    def list_loras(self) -> List[int]:
        """List available LoRA adapters."""
        pass
658
+
659
+
660
+ # --------------------------------------------------------------------------------------
661
+ # Reranker Model
662
+ # --------------------------------------------------------------------------------------
663
+
664
class Reranker(ABC):
    """Abstract base class for reranker models.

    Scores documents for relevance against a query; see RerankConfig for
    score-normalization options.
    """

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        device: Optional[str] = None,
    ) -> None:
        # Only stores the construction parameters; subclasses do the loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = device  # None = backend-chosen default device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load model from path."""
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the model."""
        pass

    @abstractmethod
    def rerank(
        self,
        query: str,
        documents: Sequence[str],
        config: Optional[RerankConfig] = None,
    ) -> List[float]:
        """Rerank documents given a query (one score per document)."""
        pass
701
+
702
+
703
+ # --------------------------------------------------------------------------------------
704
+ # Image generation
705
+ # --------------------------------------------------------------------------------------
706
+
707
class ImageGen(ABC):
    """Abstract base class for image generation models.

    Supports txt2img / img2img / config-driven generation plus scheduler,
    sampler, and LoRA management.
    """

    def __init__(
        self,
        model_path: Path,
        scheduler_config_path: Path,
        device: Optional[str] = None,
    ) -> None:
        # Only stores the construction parameters; subclasses do the loading.
        self.model_path = model_path
        self.scheduler_config_path = scheduler_config_path
        self.device = device  # None = backend-chosen default device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load model from path."""
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the model."""
        pass

    @abstractmethod
    def set_scheduler(self, config: SchedulerConfig) -> None:
        """Set scheduler configuration."""
        pass

    @abstractmethod
    def set_sampler(self, config: ImageSamplerConfig) -> None:
        """Set sampler configuration."""
        pass

    @abstractmethod
    def reset_sampler(self) -> None:
        """Reset sampler to default configuration."""
        pass

    @abstractmethod
    def txt2img(self, prompt: str, config: ImageGenerationConfig) -> Image:
        """Generate image from text prompt."""
        pass

    @abstractmethod
    def img2img(self, init_image: Image, prompt: str, config: ImageGenerationConfig) -> Image:
        """Generate image from initial image and text prompt."""
        pass

    @abstractmethod
    def generate(self, config: ImageGenerationConfig) -> Image:
        """Generate image from configuration."""
        pass

    # LoRA
    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Set active LoRA adapter."""
        pass

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Add LoRA adapter and return its ID."""
        pass

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Remove LoRA adapter."""
        pass

    @abstractmethod
    def list_loras(self) -> List[int]:
        """List available LoRA adapters."""
        pass
784
+
785
+
786
+ # --------------------------------------------------------------------------------------
787
+ # Computer vision – Generic CV Model
788
+ # --------------------------------------------------------------------------------------
789
+
790
class CVModel(ABC):
    """Abstract base class for generic computer vision models.

    The capability (OCR / classification / segmentation / custom) and all
    backend-specific paths come in through CVModelConfig.
    """

    def __init__(self, config: CVModelConfig, device: Optional[str] = None) -> None:
        # Only stores the construction parameters; subclasses do the loading.
        self.config = config
        self.device = device  # None = backend-chosen default device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def infer(self, input_image_path: str) -> CVResults:
        """Perform inference on image."""
        pass
806
+
807
+
808
+ # --------------------------------------------------------------------------------------
809
+ # Speech recognition – ASR
810
+ # --------------------------------------------------------------------------------------
811
+
812
class ASR(ABC):
    """Interface that Automatic Speech Recognition backends must implement."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Optional[Path],
        language: Optional[str],
        device: Optional[str] = None,
    ) -> None:
        # Keep the construction-time settings; subclasses perform actual loading.
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.language = language
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Tear the model down and free every resource it holds."""
        ...

    @abstractmethod
    def close(self) -> None:
        """Shut the model down."""
        ...

    @abstractmethod
    def transcribe(
        self,
        audio_path: Path,
        language: Optional[str] = None,
        config: Optional[ASRConfig] = None,
    ) -> ASRResult:
        """Turn the audio file at *audio_path* into text."""
        ...

    @abstractmethod
    def list_supported_languages(self) -> List[str]:
        """Return the languages this model can transcribe."""
        ...
851
+
852
+
853
+ # --------------------------------------------------------------------------------------
854
+ # Speech synthesis – TTS
855
+ # --------------------------------------------------------------------------------------
856
+
857
+ class TTS(ABC):
858
+ """Abstract base class for Text-to-Speech models."""
859
+
860
+ def __init__(
861
+ self,
862
+ model_path: Path,
863
+ vocoder_path: Path,
864
+ device: Optional[str] = None,
865
+ ) -> None:
866
+ self.model_path = model_path
867
+ self.vocoder_path = vocoder_path
868
+ self.device = device
869
+
870
+ @abstractmethod
871
+ def destroy(self) -> None:
872
+ """Destroy the model and free resources."""
873
+ pass
874
+
875
+ @abstractmethod
876
+ def synthesize(
877
+ self,
878
+ text: str,
879
+ config: Optional[TTSConfig] = None,
880
+ output_path: Optional[Path] = None,
881
+ ) -> TTSResult:
882
+ """Synthesize speech from text and save to filesystem."""
883
+ pass
884
+
885
+ @abstractmethod
886
+ def list_available_voices(self) -> List[str]:
887
+ """List available voices."""
888
+ pass