nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaai/__init__.py +99 -0
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +68 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +93 -0
- nexaai/asr_impl/pybind_asr_impl.py +127 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +7 -0
- nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
- nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
- nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
- nexaai/binds/cpu_gpu/libggml.dylib +0 -0
- nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
- nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/metal/libnexa_plugin.dylib +0 -0
- nexaai/binds/metal/py-lib/ml.py +888 -0
- nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
- nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
- nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
- nexaai/binds/metal/py-lib/profiling.py +239 -0
- nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
- nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
- nexaai/binds/nexaml/libggml-base.dylib +0 -0
- nexaai/binds/nexaml/libggml-cpu.so +0 -0
- nexaai/binds/nexaml/libggml-metal.so +0 -0
- nexaai/binds/nexaml/libggml.dylib +0 -0
- nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
- nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
- nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
- nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
- nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
- nexaai/binds/nexaml/libnexaproc.dylib +0 -0
- nexaai/binds/nexaml/libomp.dylib +0 -0
- nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
- nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
- nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
- nexaai/common.py +106 -0
- nexaai/cv.py +95 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +91 -0
- nexaai/cv_impl/pybind_cv_impl.py +124 -0
- nexaai/diarize.py +80 -0
- nexaai/diarize_impl/__init__.py +1 -0
- nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
- nexaai/embedder.py +73 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
- nexaai/image_gen.py +141 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
- nexaai/llm.py +98 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +271 -0
- nexaai/llm_impl/pybind_llm_impl.py +238 -0
- nexaai/log.py +92 -0
- nexaai/mlx_backend/asr/__init__.py +12 -0
- nexaai/mlx_backend/asr/interface.py +122 -0
- nexaai/mlx_backend/common/__init__.py +0 -0
- nexaai/mlx_backend/common/utils.py +25 -0
- nexaai/mlx_backend/cv/__init__.py +0 -0
- nexaai/mlx_backend/cv/generate.py +195 -0
- nexaai/mlx_backend/cv/interface.py +162 -0
- nexaai/mlx_backend/cv/main.py +81 -0
- nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
- nexaai/mlx_backend/embedding/__init__.py +0 -0
- nexaai/mlx_backend/embedding/generate.py +333 -0
- nexaai/mlx_backend/embedding/interface.py +617 -0
- nexaai/mlx_backend/embedding/main.py +173 -0
- nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
- nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
- nexaai/mlx_backend/image_gen/__init__.py +1 -0
- nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
- nexaai/mlx_backend/image_gen/interface.py +82 -0
- nexaai/mlx_backend/image_gen/main.py +281 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
- nexaai/mlx_backend/llm/__init__.py +0 -0
- nexaai/mlx_backend/llm/generate.py +149 -0
- nexaai/mlx_backend/llm/interface.py +764 -0
- nexaai/mlx_backend/llm/main.py +68 -0
- nexaai/mlx_backend/ml.py +888 -0
- nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/mlx_backend/mlx_audio/server.py +525 -0
- nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
- nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
- nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
- nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
- nexaai/mlx_backend/mlx_audio/utils.py +237 -0
- nexaai/mlx_backend/mlx_audio/version.py +1 -0
- nexaai/mlx_backend/profiling.py +239 -0
- nexaai/mlx_backend/rerank/__init__.py +0 -0
- nexaai/mlx_backend/rerank/generate.py +174 -0
- nexaai/mlx_backend/rerank/interface.py +287 -0
- nexaai/mlx_backend/rerank/main.py +127 -0
- nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
- nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
- nexaai/mlx_backend/sd/__init__.py +1 -0
- nexaai/mlx_backend/sd/interface.py +362 -0
- nexaai/mlx_backend/sd/main.py +286 -0
- nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
- nexaai/mlx_backend/sd/modeling/clip.py +116 -0
- nexaai/mlx_backend/sd/modeling/config.py +65 -0
- nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
- nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
- nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
- nexaai/mlx_backend/sd/modeling/unet.py +460 -0
- nexaai/mlx_backend/sd/modeling/vae.py +274 -0
- nexaai/mlx_backend/tts/__init__.py +12 -0
- nexaai/mlx_backend/tts/interface.py +276 -0
- nexaai/mlx_backend/vlm/__init__.py +3 -0
- nexaai/mlx_backend/vlm/generate.py +572 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +559 -0
- nexaai/mlx_backend/vlm/main.py +365 -0
- nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
- nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
- nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
- nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
- nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
- nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
- nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
- nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
- nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
- nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
- nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
- nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
- nexaai/rerank.py +57 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
- nexaai/runtime.py +68 -0
- nexaai/runtime_error.py +24 -0
- nexaai/tts.py +75 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +94 -0
- nexaai/tts_impl/pybind_tts_impl.py +43 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/manifest_utils.py +531 -0
- nexaai/utils/model_manager.py +1745 -0
- nexaai/utils/model_types.py +49 -0
- nexaai/utils/progress_tracker.py +389 -0
- nexaai/utils/quantization_utils.py +245 -0
- nexaai/vlm.py +130 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
- nexaai-1.0.29.dist-info/METADATA +35 -0
- nexaai-1.0.29.dist-info/RECORD +580 -0
- nexaai-1.0.29.dist-info/WHEEL +5 -0
- nexaai-1.0.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,888 @@

```python
# This file defines the python interface that c-lib expects from a python backend

from __future__ import annotations
from typing import Optional
from pathlib import Path
from dataclasses import dataclass

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import (
    Any,
    Callable,
    List,
    Optional,
    Protocol,
    Sequence,
    Tuple,
    TypedDict,
)

# --------------------------------------------------------------------------------------
# Core aliases & callback protocols
# --------------------------------------------------------------------------------------

Path = str

LogCallback = Callable[[str], None]


class TokenCallback(Protocol):
    def __call__(self, token: str, user_data: Any) -> bool: ...


# --------------------------------------------------------------------------------------
# Core module functions
# --------------------------------------------------------------------------------------

def init() -> None:
    """Initialize the ML module."""
    pass


def deinit() -> None:
    """Deinitialize the ML module."""
    pass


def set_log(callback: LogCallback) -> None:
    """Set the logging callback."""
    pass


def log(message: str) -> None:
    """Log a message."""
    pass


def model_config_default() -> ModelConfig:
    """Get default model configuration with sensible defaults."""
    return ModelConfig()


# --------------------------------------------------------------------------------------
# Basic data structures
# --------------------------------------------------------------------------------------

@dataclass
class Image:
    """Image data structure."""
    data: List[float]  # width × height × channels
    width: int
    height: int
    channels: int  # 3 = RGB, 4 = RGBA


@dataclass
class Audio:
    """Audio data structure."""
    data: List[float]  # num_samples × channels
    sample_rate: int
    channels: int
    num_samples: int


@dataclass
class Video:
    """Video data structure."""
    data: List[float]  # width × height × channels × num_frames
    width: int
    height: int
    channels: int
    num_frames: int


# --------------------------------------------------------------------------------------
# Language-model structures
# --------------------------------------------------------------------------------------

@dataclass
class ModelConfig:
    """Configuration for model parameters."""
    n_ctx: int = 0  # text context, 0 = from model
    n_threads: int = 0  # number of threads to use for generation
    n_threads_batch: int = 0  # number of threads to use for batch processing
    n_batch: int = 0  # logical maximum batch size that can be submitted to llama_decode
    n_ubatch: int = 0  # physical maximum batch size
    # max number of sequences (i.e. distinct states for recurrent models)
    n_seq_max: int = 0
    # path to chat template file, optional
    chat_template_path: Optional[Path] = None
    # content of chat template file, optional
    chat_template_content: Optional[str] = None


@dataclass
class SamplerConfig:
    """Configuration for text sampling."""
    temperature: float = 0.7
    top_p: float = 0.9
    top_k: int = 40
    min_p: float = 0.0  # Minimum probability for nucleus sampling
    repetition_penalty: float = 1.0
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    seed: int = -1  # -1 for random
    grammar_path: Optional[Path] = None
    # Optional grammar string (BNF-like format)
    grammar_string: Optional[str] = None


@dataclass
class GenerationConfig:
    """Configuration for text generation."""
    max_tokens: int = 512
    stop: Sequence[str] = field(default_factory=tuple)
    n_past: int = 0
    sampler_config: Optional[SamplerConfig] = None
    # Array of image paths for VLM (None if none)
    image_paths: Optional[Sequence[Path]] = None
    # Array of audio paths for VLM (None if none)
    audio_paths: Optional[Sequence[Path]] = None


@dataclass
class ChatMessage:
    """A chat message with role and content."""
    role: str  # "user" | "assistant" | "system"
    content: str


class ToolFunction(TypedDict):
    name: str
    description: str
    parameters_json: str


class Tool(TypedDict):
    type: str
    function: ToolFunction


# --------------------------------------------------------------------------------------
# Embedding / rerank / diffusion / OCR / ASR / TTS utilities
# --------------------------------------------------------------------------------------

@dataclass
class EmbeddingConfig:
    """Configuration for embeddings."""
    batch_size: int = 1
    normalize: bool = True
    normalize_method: str = "l2"  # "l2" | "mean" | "none"


@dataclass
class RerankConfig:
    """Configuration for reranking."""
    batch_size: int = 1
    normalize: bool = True
    normalize_method: str = "softmax"  # "softmax" | "min-max" | "none"


# image-gen


@dataclass
class ImageGenTxt2ImgInput:
    """Input structure for text-to-image generation."""
    prompt: str
    config: ImageGenerationConfig
    output_path: Optional[Path] = None


@dataclass
class ImageGenImg2ImgInput:
    """Input structure for image-to-image generation."""
    init_image_path: Path
    prompt: str
    config: ImageGenerationConfig
    output_path: Optional[Path] = None


@dataclass
class ImageGenOutput:
    """Output structure for image generation."""
    output_image_path: Path


@dataclass
class ImageSamplerConfig:
    """Configuration for image sampling."""
    method: str = "ddim"
    steps: int = 20
    guidance_scale: float = 7.5
    eta: float = 0.0
    seed: int = -1  # -1 for random


@dataclass
class ImageGenCreateInput:
    """Configuration for image generation."""
    model_name: str
    model_path: Path
    config: ModelConfig
    scheduler_config_path: Path
    plugin_id: str
    device_id: Optional[str] = None


@dataclass
class ImageGenerationConfig:
    """Configuration for image generation."""
    prompts: List[str]
    sampler_config: ImageSamplerConfig
    scheduler_config: SchedulerConfig
    strength: float
    negative_prompts: Optional[List[str]] = None
    height: int = 512
    width: int = 512


@dataclass
class SchedulerConfig:
    """Configuration for diffusion scheduler."""
    type: str = "ddim"
    num_train_timesteps: int = 1000
    steps_offset: int = 0  # An offset added to the inference steps
    beta_start: float = 0.00085
    beta_end: float = 0.012
    beta_schedule: str = "scaled_linear"
    prediction_type: str = "epsilon"
    timestep_type: str = "discrete"
    timestep_spacing: str = "linspace"
    interpolation_type: str = "linear"
    config_path: Optional[Path] = None


@dataclass
class ASRConfig:
    """Configuration for ASR."""
    timestamps: str = "none"  # "none" | "segment" | "word"
    beam_size: int = 5
    stream: bool = False


@dataclass
class ASRResult:
    """Result from ASR processing."""
    transcript: str
    confidence_scores: Sequence[float]
    timestamps: Sequence[Tuple[float, float]]
    duration_us: float


@dataclass
class TTSConfig:
    """Configuration for TTS."""
    voice: str = "default"
    speed: float = 1.0
    seed: int = -1  # -1 for random
    sample_rate: int = 22050


@dataclass
class TTSSamplerConfig:
    """Configuration for TTS sampling."""
    temperature: float = 1.0
    noise_scale: float = 0.667
    length_scale: float = 1.0


@dataclass
class TTSResult:
    """Result from TTS processing."""
    audio_path: str  # Path where the synthesized audio is saved
    duration_seconds: float
    sample_rate: int
    channels: int
    num_samples: int


# --------------------------------------------------------------------------------------
# Computer Vision structures
# --------------------------------------------------------------------------------------

@dataclass
class BoundingBox:
    """Generic bounding box structure."""
    x: float  # X coordinate (normalized or pixel, depends on model)
    y: float  # Y coordinate (normalized or pixel, depends on model)
    width: float  # Width
    height: float  # Height


@dataclass
class CVResult:
    """Generic detection/classification result."""
    image_paths: Optional[List[Path]] = None  # Output image paths
    image_count: int = 0  # Number of output images
    class_id: int = 0  # Class ID (example: ConvNext)
    confidence: float = 0.0  # Confidence score [0.0-1.0]
    bbox: Optional[BoundingBox] = None  # Bounding box (example: YOLO)
    text: Optional[str] = None  # Text result (example: OCR)
    # Feature embedding (example: CLIP embedding)
    embedding: Optional[List[float]] = None
    embedding_dim: int = 0  # Embedding dimension


@dataclass
class CVResults:
    """Generic CV inference result."""
    results: List[CVResult]  # Array of CV results
    result_count: int  # Number of CV results


class CVCapabilities:
    """CV capabilities enum."""
    OCR = 0  # OCR
    CLASSIFICATION = 1  # Classification
    SEGMENTATION = 2  # Segmentation
    CUSTOM = 3  # Custom task


@dataclass
class CVModelConfig:
    """CV model preprocessing configuration."""
    capabilities: int  # CVCapabilities

    # MLX-OCR
    det_model_path: Optional[str] = None  # Detection model path
    rec_model_path: Optional[str] = None  # Recognition model path

    # QNN
    model_path: Optional[str] = None  # Model path
    system_library_path: Optional[str] = None  # System library path
    backend_library_path: Optional[str] = None  # Backend library path
    extension_library_path: Optional[str] = None  # Extension library path
    config_file_path: Optional[str] = None  # Config file path
    char_dict_path: Optional[str] = None  # Character dictionary path
```
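Before the interface classes below, the shape of these dataclasses is easiest to see in use. A minimal sketch (not part of the package; the literal values are illustrative only), assuming the definitions above are in scope:

```python
# SamplerConfig nests inside GenerationConfig; multimodal inputs are passed
# as sequences of file paths (the Path alias above is just str).
sampler = SamplerConfig(temperature=0.2, top_p=0.95, seed=42)

gen = GenerationConfig(
    max_tokens=256,
    stop=("</s>",),              # stop sequences; the default is an empty tuple
    sampler_config=sampler,
    image_paths=["input.png"],   # consumed by VLMs, ignored for text-only use
)
```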
```python
# --------------------------------------------------------------------------------------
# LLM
# --------------------------------------------------------------------------------------

class LLM(ABC):
    """Abstract base class for Large Language Models."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        config: ModelConfig,
        device: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.config = config
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def reset(self) -> None:
        """Reset the model state."""
        pass

    # Tokenization
    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        pass

    @abstractmethod
    def decode(self, token_ids: Sequence[int]) -> str:
        """Decode token IDs to text."""
        pass

    # KV-cache
    @abstractmethod
    def save_kv_cache(self, path: Path) -> bool:
        """Save KV cache to file."""
        pass

    @abstractmethod
    def load_kv_cache(self, path: Path) -> bool:
        """Load KV cache from file."""
        pass

    # LoRA
    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Set active LoRA adapter."""
        pass

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Add LoRA adapter and return its ID."""
        pass

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Remove LoRA adapter."""
        pass

    @abstractmethod
    def list_loras(self) -> List[int]:
        """List available LoRA adapters."""
        pass

    # Sampler
    @abstractmethod
    def set_sampler(self, config: SamplerConfig) -> None:
        """Set sampler configuration."""
        pass

    @abstractmethod
    def reset_sampler(self) -> None:
        """Reset sampler to default configuration."""
        pass

    @abstractmethod
    def generate_stream(
        self,
        prompt: str,
        config: Optional[GenerationConfig],
        on_token: TokenCallback,
        user_data: Any = None,
    ) -> str:
        """Generate text with streaming callback."""
        pass

    @abstractmethod
    def get_chat_template(self, template_name: str) -> str:
        """Get chat template by name."""
        pass

    @abstractmethod
    def apply_chat_template(self, messages: Sequence[ChatMessage], tools: Optional[Sequence[Tool]] = None, enable_thinking: bool = True) -> str:
        """Apply chat template to messages with optional tools support."""
        pass

    # Embeddings
    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Generate embeddings for texts."""
        pass
```
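`generate_stream` pushes tokens through the `TokenCallback` protocol rather than returning only the final string. A hypothetical consumer sketch (not from the package), assuming a concrete backend implementing the `LLM` ABC; note the interface file does not document the callback's bool return, which presumably acts as a keep-generating flag:

```python
def print_token(token: str, user_data: Any) -> bool:
    # Called once per generated token; print incrementally.
    print(token, end="", flush=True)
    return True  # presumably: continue generating

def chat_once(llm: LLM, question: str) -> str:
    # Render a chat-formatted prompt, then stream the completion.
    prompt = llm.apply_chat_template([ChatMessage(role="user", content=question)])
    return llm.generate_stream(prompt, GenerationConfig(max_tokens=128), print_token)
```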
class VLM(ABC):
|
|
481
|
+
"""Abstract base class for Vision-Language Models."""
|
|
482
|
+
|
|
483
|
+
def __init__(
|
|
484
|
+
self,
|
|
485
|
+
model_path: Path,
|
|
486
|
+
mmproj_path: Path,
|
|
487
|
+
context_length: int,
|
|
488
|
+
device: Optional[str] = None,
|
|
489
|
+
) -> None:
|
|
490
|
+
self.model_path = model_path
|
|
491
|
+
self.mmproj_path = mmproj_path
|
|
492
|
+
self.context_length = context_length
|
|
493
|
+
self.device = device
|
|
494
|
+
|
|
495
|
+
@abstractmethod
|
|
496
|
+
def destroy(self) -> None:
|
|
497
|
+
"""Destroy the model and free resources."""
|
|
498
|
+
pass
|
|
499
|
+
|
|
500
|
+
@abstractmethod
|
|
501
|
+
def reset(self) -> None:
|
|
502
|
+
"""Reset the model state."""
|
|
503
|
+
pass
|
|
504
|
+
|
|
505
|
+
# Tokenization
|
|
506
|
+
@abstractmethod
|
|
507
|
+
def encode(self, text: str) -> List[int]:
|
|
508
|
+
"""Encode text to token IDs."""
|
|
509
|
+
pass
|
|
510
|
+
|
|
511
|
+
@abstractmethod
|
|
512
|
+
def decode(self, token_ids: Sequence[int]) -> str:
|
|
513
|
+
"""Decode token IDs to text."""
|
|
514
|
+
pass
|
|
515
|
+
|
|
516
|
+
# Sampler
|
|
517
|
+
@abstractmethod
|
|
518
|
+
def set_sampler(self, config: SamplerConfig) -> None:
|
|
519
|
+
"""Set sampler configuration."""
|
|
520
|
+
pass
|
|
521
|
+
|
|
522
|
+
@abstractmethod
|
|
523
|
+
def reset_sampler(self) -> None:
|
|
524
|
+
"""Reset sampler to default configuration."""
|
|
525
|
+
pass
|
|
526
|
+
|
|
527
|
+
# Generation
|
|
528
|
+
@abstractmethod
|
|
529
|
+
def generate(
|
|
530
|
+
self,
|
|
531
|
+
prompt: str,
|
|
532
|
+
config: Optional[GenerationConfig] = None,
|
|
533
|
+
) -> str:
|
|
534
|
+
"""Generate text from prompt."""
|
|
535
|
+
pass
|
|
536
|
+
|
|
537
|
+
@abstractmethod
|
|
538
|
+
def generate_multimodal(
|
|
539
|
+
self,
|
|
540
|
+
prompt: str,
|
|
541
|
+
image_paths: Optional[Sequence[Path]] = None,
|
|
542
|
+
audio_paths: Optional[Sequence[Path]] = None,
|
|
543
|
+
config: Optional[GenerationConfig] = None,
|
|
544
|
+
) -> str:
|
|
545
|
+
"""Generate text from prompt with multiple images and audio."""
|
|
546
|
+
pass
|
|
547
|
+
|
|
548
|
+
@abstractmethod
|
|
549
|
+
def generate_stream(
|
|
550
|
+
self,
|
|
551
|
+
prompt: str,
|
|
552
|
+
config: Optional[GenerationConfig],
|
|
553
|
+
on_token: TokenCallback,
|
|
554
|
+
user_data: Any = None,
|
|
555
|
+
) -> str:
|
|
556
|
+
"""Generate text with streaming callback."""
|
|
557
|
+
pass
|
|
558
|
+
|
|
559
|
+
@abstractmethod
|
|
560
|
+
def generate_stream_multimodal(
|
|
561
|
+
self,
|
|
562
|
+
prompt: str,
|
|
563
|
+
image_paths: Optional[Sequence[Path]] = None,
|
|
564
|
+
audio_paths: Optional[Sequence[Path]] = None,
|
|
565
|
+
config: Optional[GenerationConfig] = None,
|
|
566
|
+
on_token: Optional[TokenCallback] = None,
|
|
567
|
+
user_data: Any = None,
|
|
568
|
+
) -> str:
|
|
569
|
+
"""Generate text from prompt with multiple images and audio using streaming callback."""
|
|
570
|
+
pass
|
|
571
|
+
|
|
572
|
+
@abstractmethod
|
|
573
|
+
def get_chat_template(self, template_name: str) -> str:
|
|
574
|
+
"""Get chat template by name."""
|
|
575
|
+
pass
|
|
576
|
+
|
|
577
|
+
@abstractmethod
|
|
578
|
+
def apply_chat_template(self, messages: Sequence[ChatMessage], tools: Optional[Sequence[Tool]] = None, enable_thinking: bool = True) -> str:
|
|
579
|
+
"""Apply chat template to messages with optional tools support."""
|
|
580
|
+
pass
|
|
581
|
+
|
|
582
|
+
# Embeddings
|
|
583
|
+
@abstractmethod
|
|
584
|
+
def embed(
|
|
585
|
+
self,
|
|
586
|
+
texts: Sequence[str],
|
|
587
|
+
config: Optional[EmbeddingConfig] = None,
|
|
588
|
+
) -> List[List[float]]:
|
|
589
|
+
"""Generate embeddings for texts."""
|
|
590
|
+
pass
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
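# A minimal usage sketch (not part of the package source): asking a concrete
# VLM about one image. The interface takes media as file paths rather than
# decoded tensors; `config=None` is assumed to select default settings.
def describe_image(vlm: VLM, image: Path) -> str:
    return vlm.generate_multimodal(
        prompt="Describe this image.",
        image_paths=[image],
        audio_paths=None,
        config=None,
    )
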
# --------------------------------------------------------------------------------------
# Embedding Model
# --------------------------------------------------------------------------------------

class Embedder(ABC):
    """Abstract base class for embedding models."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        device: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load model from path."""
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the model."""
        pass

    @abstractmethod
    def embed(
        self,
        texts: Sequence[str],
        config: Optional[EmbeddingConfig] = None,
    ) -> List[List[float]]:
        """Generate embeddings for texts."""
        pass

    @abstractmethod
    def embedding_dim(self) -> int:
        """Get embedding dimension."""
        pass

    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Set active LoRA adapter."""
        pass

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Add LoRA adapter and return its ID."""
        pass

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Remove LoRA adapter."""
        pass

    @abstractmethod
    def list_loras(self) -> List[int]:
        """List available LoRA adapters."""
        pass

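# A minimal sketch (not part of the package source): Embedder.embed returns
# plain List[List[float]] vectors, so similarity needs no package internals.
import math

def cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0

# vecs = embedder.embed(["hello", "world"])   # embedder: any concrete subclass
# score = cosine_similarity(vecs[0], vecs[1])
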
# --------------------------------------------------------------------------------------
# Reranker Model
# --------------------------------------------------------------------------------------

class Reranker(ABC):
    """Abstract base class for reranker models."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Path,
        device: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load model from path."""
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the model."""
        pass

    @abstractmethod
    def rerank(
        self,
        query: str,
        documents: Sequence[str],
        config: Optional[RerankConfig] = None,
    ) -> List[float]:
        """Rerank documents given a query."""
        pass

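# A minimal sketch (not part of the package source): rerank returns one score
# per document, parallel to the input sequence, so selecting the best matches
# is a zip-and-sort. Higher-is-better is an assumption; the score convention
# is not shown in this diff.
def top_k(reranker: Reranker, query: str, docs: Sequence[str], k: int = 5) -> List[str]:
    scores = reranker.rerank(query, docs, config=None)
    ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:k]]
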
# --------------------------------------------------------------------------------------
# Image generation
# --------------------------------------------------------------------------------------

class ImageGen(ABC):
    """Abstract base class for image generation models."""

    def __init__(
        self,
        model_path: Path,
        scheduler_config_path: Path,
        device: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.scheduler_config_path = scheduler_config_path
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def load_model(self, model_path: Path, extra_data: Any = None) -> bool:
        """Load model from path."""
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the model."""
        pass

    @abstractmethod
    def set_scheduler(self, config: SchedulerConfig) -> None:
        """Set scheduler configuration."""
        pass

    @abstractmethod
    def set_sampler(self, config: ImageSamplerConfig) -> None:
        """Set sampler configuration."""
        pass

    @abstractmethod
    def reset_sampler(self) -> None:
        """Reset sampler to default configuration."""
        pass

    @abstractmethod
    def txt2img(self, prompt: str, config: ImageGenerationConfig) -> Image:
        """Generate image from text prompt."""
        pass

    @abstractmethod
    def img2img(self, init_image: Image, prompt: str, config: ImageGenerationConfig) -> Image:
        """Generate image from initial image and text prompt."""
        pass

    @abstractmethod
    def generate(self, config: ImageGenerationConfig) -> Image:
        """Generate image from configuration."""
        pass

    @abstractmethod
    def set_lora(self, lora_id: int) -> None:
        """Set active LoRA adapter."""
        pass

    @abstractmethod
    def add_lora(self, lora_path: Path) -> int:
        """Add LoRA adapter and return its ID."""
        pass

    @abstractmethod
    def remove_lora(self, lora_id: int) -> None:
        """Remove LoRA adapter."""
        pass

    @abstractmethod
    def list_loras(self) -> List[int]:
        """List available LoRA adapters."""
        pass

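# A minimal sketch (not part of the package source): a plausible call order for
# text-to-image with the interface above: configure scheduler and sampler,
# render, then reset. The fields of the config classes are not visible in this
# diff, so instances are taken as parameters rather than constructed here.
def render(
    gen: ImageGen,
    prompt: str,
    scheduler: SchedulerConfig,
    sampler: ImageSamplerConfig,
    image_cfg: ImageGenerationConfig,
) -> Image:
    gen.set_scheduler(scheduler)
    gen.set_sampler(sampler)
    try:
        return gen.txt2img(prompt, image_cfg)
    finally:
        gen.reset_sampler()  # restore default sampling for later calls
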
# --------------------------------------------------------------------------------------
# Computer vision – Generic CV Model
# --------------------------------------------------------------------------------------

class CVModel(ABC):
    """Abstract base class for generic computer vision models."""

    def __init__(self, config: CVModelConfig, device: Optional[str] = None) -> None:
        self.config = config
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def infer(self, input_image_path: str) -> CVResults:
        """Perform inference on image."""
        pass

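# A minimal sketch (not part of the package source): CVModel exposes an
# explicit destroy(), so a context-manager wrapper ties native-resource cleanup
# to scope. Works with any concrete subclass; none appear in this diff.
from contextlib import contextmanager

@contextmanager
def cv_session(model: CVModel):
    try:
        yield model
    finally:
        model.destroy()  # free native resources deterministically

# with cv_session(some_cv_model) as m:   # some_cv_model: hypothetical instance
#     results = m.infer("photo.jpg")
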
# --------------------------------------------------------------------------------------
# Speech recognition – ASR
# --------------------------------------------------------------------------------------

class ASR(ABC):
    """Abstract base class for Automatic Speech Recognition models."""

    def __init__(
        self,
        model_path: Path,
        tokenizer_path: Optional[Path],
        language: Optional[str],
        device: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.language = language
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def close(self) -> None:
        """Close the model."""
        pass

    @abstractmethod
    def transcribe(
        self,
        audio_path: Path,
        language: Optional[str] = None,
        config: Optional[ASRConfig] = None,
    ) -> ASRResult:
        """Transcribe audio file to text."""
        pass

    @abstractmethod
    def list_supported_languages(self) -> List[str]:
        """List supported languages."""
        pass

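# A minimal sketch (not part of the package source): validating a language
# against list_supported_languages before transcribing. ASRResult's fields are
# not visible in this diff, so the result is returned unchanged.
def transcribe_checked(asr: ASR, audio: Path, language: str) -> ASRResult:
    if language not in asr.list_supported_languages():
        raise ValueError(f"unsupported language: {language}")
    return asr.transcribe(audio, language=language, config=None)
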
# --------------------------------------------------------------------------------------
# Speech synthesis – TTS
# --------------------------------------------------------------------------------------

class TTS(ABC):
    """Abstract base class for Text-to-Speech models."""

    def __init__(
        self,
        model_path: Path,
        vocoder_path: Path,
        device: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.vocoder_path = vocoder_path
        self.device = device

    @abstractmethod
    def destroy(self) -> None:
        """Destroy the model and free resources."""
        pass

    @abstractmethod
    def synthesize(
        self,
        text: str,
        config: Optional[TTSConfig] = None,
        output_path: Optional[Path] = None,
    ) -> TTSResult:
        """Synthesize speech from text and save to filesystem."""
        pass

    @abstractmethod
    def list_available_voices(self) -> List[str]:
        """List available voices."""
        pass
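
# A minimal sketch (not part of the package source): synthesizing several lines
# to numbered files. Whether output_path=None falls back to a default location
# is not visible in this diff, so an explicit path is always passed here.
def synthesize_batch(tts: TTS, lines: Sequence[str], out_dir: Path) -> List[TTSResult]:
    results: List[TTSResult] = []
    for i, line in enumerate(lines):
        out = out_dir / f"utt_{i:03d}.wav"  # hypothetical naming scheme
        results.append(tts.synthesize(line, config=None, output_path=out))
    return results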