nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaai/__init__.py +99 -0
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +68 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +93 -0
- nexaai/asr_impl/pybind_asr_impl.py +127 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +7 -0
- nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
- nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
- nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
- nexaai/binds/cpu_gpu/libggml.dylib +0 -0
- nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
- nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/metal/libnexa_plugin.dylib +0 -0
- nexaai/binds/metal/py-lib/ml.py +888 -0
- nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
- nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
- nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
- nexaai/binds/metal/py-lib/profiling.py +239 -0
- nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
- nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
- nexaai/binds/nexaml/libggml-base.dylib +0 -0
- nexaai/binds/nexaml/libggml-cpu.so +0 -0
- nexaai/binds/nexaml/libggml-metal.so +0 -0
- nexaai/binds/nexaml/libggml.dylib +0 -0
- nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
- nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
- nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
- nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
- nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
- nexaai/binds/nexaml/libnexaproc.dylib +0 -0
- nexaai/binds/nexaml/libomp.dylib +0 -0
- nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
- nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
- nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
- nexaai/common.py +106 -0
- nexaai/cv.py +95 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +91 -0
- nexaai/cv_impl/pybind_cv_impl.py +124 -0
- nexaai/diarize.py +80 -0
- nexaai/diarize_impl/__init__.py +1 -0
- nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
- nexaai/embedder.py +73 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
- nexaai/image_gen.py +141 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
- nexaai/llm.py +98 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +271 -0
- nexaai/llm_impl/pybind_llm_impl.py +238 -0
- nexaai/log.py +92 -0
- nexaai/mlx_backend/asr/__init__.py +12 -0
- nexaai/mlx_backend/asr/interface.py +122 -0
- nexaai/mlx_backend/common/__init__.py +0 -0
- nexaai/mlx_backend/common/utils.py +25 -0
- nexaai/mlx_backend/cv/__init__.py +0 -0
- nexaai/mlx_backend/cv/generate.py +195 -0
- nexaai/mlx_backend/cv/interface.py +162 -0
- nexaai/mlx_backend/cv/main.py +81 -0
- nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
- nexaai/mlx_backend/embedding/__init__.py +0 -0
- nexaai/mlx_backend/embedding/generate.py +333 -0
- nexaai/mlx_backend/embedding/interface.py +617 -0
- nexaai/mlx_backend/embedding/main.py +173 -0
- nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
- nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
- nexaai/mlx_backend/image_gen/__init__.py +1 -0
- nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
- nexaai/mlx_backend/image_gen/interface.py +82 -0
- nexaai/mlx_backend/image_gen/main.py +281 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
- nexaai/mlx_backend/llm/__init__.py +0 -0
- nexaai/mlx_backend/llm/generate.py +149 -0
- nexaai/mlx_backend/llm/interface.py +764 -0
- nexaai/mlx_backend/llm/main.py +68 -0
- nexaai/mlx_backend/ml.py +888 -0
- nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/mlx_backend/mlx_audio/server.py +525 -0
- nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
- nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
- nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
- nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
- nexaai/mlx_backend/mlx_audio/utils.py +237 -0
- nexaai/mlx_backend/mlx_audio/version.py +1 -0
- nexaai/mlx_backend/profiling.py +239 -0
- nexaai/mlx_backend/rerank/__init__.py +0 -0
- nexaai/mlx_backend/rerank/generate.py +174 -0
- nexaai/mlx_backend/rerank/interface.py +287 -0
- nexaai/mlx_backend/rerank/main.py +127 -0
- nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
- nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
- nexaai/mlx_backend/sd/__init__.py +1 -0
- nexaai/mlx_backend/sd/interface.py +362 -0
- nexaai/mlx_backend/sd/main.py +286 -0
- nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
- nexaai/mlx_backend/sd/modeling/clip.py +116 -0
- nexaai/mlx_backend/sd/modeling/config.py +65 -0
- nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
- nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
- nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
- nexaai/mlx_backend/sd/modeling/unet.py +460 -0
- nexaai/mlx_backend/sd/modeling/vae.py +274 -0
- nexaai/mlx_backend/tts/__init__.py +12 -0
- nexaai/mlx_backend/tts/interface.py +276 -0
- nexaai/mlx_backend/vlm/__init__.py +3 -0
- nexaai/mlx_backend/vlm/generate.py +572 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +559 -0
- nexaai/mlx_backend/vlm/main.py +365 -0
- nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
- nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
- nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
- nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
- nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
- nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
- nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
- nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
- nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
- nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
- nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
- nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
- nexaai/rerank.py +57 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
- nexaai/runtime.py +68 -0
- nexaai/runtime_error.py +24 -0
- nexaai/tts.py +75 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +94 -0
- nexaai/tts_impl/pybind_tts_impl.py +43 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/manifest_utils.py +531 -0
- nexaai/utils/model_manager.py +1745 -0
- nexaai/utils/model_types.py +49 -0
- nexaai/utils/progress_tracker.py +389 -0
- nexaai/utils/quantization_utils.py +245 -0
- nexaai/vlm.py +130 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
- nexaai-1.0.29.dist-info/METADATA +35 -0
- nexaai-1.0.29.dist-info/RECORD +580 -0
- nexaai-1.0.29.dist-info/WHEEL +5 -0
- nexaai-1.0.29.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Copyright (c) 2025 SparkAudio
|
|
2
|
+
# 2025 Xinsheng Wang (w.xinshawn@gmail.com)
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
Description:
|
|
17
|
+
This script contains a collection of functions designed to handle various
|
|
18
|
+
file reading and writing operations. It provides utilities to read from files,
|
|
19
|
+
write data to files, and perform file manipulation tasks.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
import csv
|
|
24
|
+
import json
|
|
25
|
+
import os
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Any, Dict, List, Set, Union
|
|
28
|
+
|
|
29
|
+
from omegaconf import DictConfig, OmegaConf
|
|
30
|
+
from tqdm import tqdm
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
    """
    Resolve the target of a symbolic link.

    The link target is read with ``os.readlink`` and joined onto the
    directory containing the link, so a relative target is interpreted
    relative to the link's location; an absolute target passes through
    ``os.path.join`` unchanged.

    Args:
        symbolic_link_path (Path): The path to the symbolic link.

    Returns:
        Path: The path that the symbolic link points to.

    Raises:
        OSError: If ``symbolic_link_path`` is not a symbolic link.
    """
    link_directory = os.path.dirname(symbolic_link_path)
    target_path_relative = os.readlink(symbolic_link_path)
    # Wrap in Path so the return value matches the annotated return type
    # (os.path.join returns a plain str).
    return Path(os.path.join(link_directory, target_path_relative))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def write_jsonl(metadata: List[dict], file_path: Path) -> None:
    """Serialize a list of dictionaries to a JSONL file.

    Each dictionary becomes one JSON-encoded line of the output file.

    Args:
        metadata : List[dict]
            The records to serialize, one per output line.
        file_path : Path
            Destination path of the JSONL file.
    """
    with open(file_path, "w", encoding="utf-8") as sink:
        for record in tqdm(metadata, desc="writing jsonl"):
            # One JSON document per line; keep non-ASCII characters as-is.
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"jsonl saved to {file_path}")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def read_jsonl(file_path: Path) -> List[dict]:
|
|
69
|
+
"""
|
|
70
|
+
Reads a JSONL file and returns a list of dictionaries.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
file_path : Path
|
|
74
|
+
The path to the JSONL file to be read.
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
List[dict]
|
|
78
|
+
A list of dictionaries parsed from each line of the JSONL file.
|
|
79
|
+
"""
|
|
80
|
+
metadata = []
|
|
81
|
+
# Open the file for reading
|
|
82
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
83
|
+
# Split the file into lines
|
|
84
|
+
lines = f.read().splitlines()
|
|
85
|
+
# Process each line
|
|
86
|
+
for line in lines:
|
|
87
|
+
# Convert JSON string back to dictionary and append to list
|
|
88
|
+
meta = json.loads(line)
|
|
89
|
+
metadata.append(meta)
|
|
90
|
+
# Return the list of metadata
|
|
91
|
+
return metadata
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def read_json_as_jsonl(file_path: Path) -> List[dict]:
|
|
95
|
+
metadata = []
|
|
96
|
+
with open(file_path, "r", encoding="utf-8") as infile:
|
|
97
|
+
data = json.load(infile)
|
|
98
|
+
for k in sorted(data.keys()):
|
|
99
|
+
meta = {"index": k}
|
|
100
|
+
meta.update(data[k])
|
|
101
|
+
metadata.append(meta)
|
|
102
|
+
return metadata
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
|
|
106
|
+
processed_meta = {}
|
|
107
|
+
for k, v in meta.items():
|
|
108
|
+
if isinstance(v, str):
|
|
109
|
+
processed_meta[k] = v.encode("utf-8").decode("unicode_escape")
|
|
110
|
+
else:
|
|
111
|
+
processed_meta[k] = v
|
|
112
|
+
return processed_meta
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def load_config(config_path: Path) -> DictConfig:
    """Load an OmegaConf configuration, merging in an optional base config.

    If the loaded config defines a ``base_config`` entry, that file is
    loaded first and the main config is merged on top of it, so the main
    config's values win on conflicts.

    Args:
        config_path (Path): Path to the configuration file.
    """
    config = OmegaConf.load(config_path)

    base_path = config.get("base_config", None)
    if base_path is not None:
        # Values from `config` override those coming from the base file.
        config = OmegaConf.merge(OmegaConf.load(base_path), config)

    return config
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
    """
    Convert a JSONL file to a CSV file.

    Reads the JSONL file once to collect the row data and the union of all
    keys, then writes a CSV whose columns are the sorted keys. Rows missing
    a key get an empty cell (csv.DictWriter's default ``restval``).

    Args:
        jsonl_file_path (str): Path of the input JSONL file.
        csv_file_path (str): Path of the output CSV file.
    """
    all_keys = set()
    data_rows = []

    # Single pass: gather both the rows and the union of their keys.
    # Open with an explicit encoding so behavior does not depend on the
    # platform's default locale encoding.
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            data = json.loads(line.strip())
            data_rows.append(data)
            all_keys.update(data.keys())

    # Sorted keys give a deterministic column order.
    sorted_keys = sorted(all_keys)

    with open(csv_file_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)
        writer.writeheader()
        # writerows handles the whole batch in one C-level call.
        writer.writerows(data_rows)

    print(f"CSV file has been created at {csv_file_path}")
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def save_metadata(data, filename, headers=None):
    """
    Save metadata to a pipe-delimited text file.

    Args:
        data (list of dict): Metadata records to be saved.
        filename (str): Name of the file to save the metadata.
        headers (list of str): Column order to write; defaults to the keys
            of the first record in ``data`` if not provided.
    """
    # Derive headers from the first record when the caller did not supply
    # any. An empty dataset now produces just an (empty) header line
    # instead of crashing on data[0].
    if headers is None:
        headers = list(data[0].keys()) if data else []

    with open(filename, "w", encoding="utf-8") as file:
        # Header row first.
        file.write("|".join(headers) + "\n")
        for entry in data:
            # '|' is the field delimiter, so replace it inside values with a
            # space to keep rows parseable; missing keys become empty fields.
            formatted_values = [
                str(entry.get(key, "")).replace("|", " ") for key in headers
            ]
            file.write("|".join(formatted_values) + "\n")
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def read_metadata(filename, headers=None):
    """
    Read pipe-delimited metadata from a file.

    Args:
        filename (str): The file from which to read the metadata.
        headers (list of str): Column names; when omitted, the first line
            of the file is parsed as the header row.

    Returns:
        list of dict: The metadata read from the file.
        list of str: The headers used in the file.
    """
    with open(filename, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Take headers from the first line when not provided. An empty file now
    # yields no headers and no data instead of raising IndexError.
    if headers is None:
        if not lines:
            return [], []
        headers = lines[0].strip().split("|")
        lines = lines[1:]

    data = []
    for line in lines:
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # zip truncates at the shorter side: short rows drop trailing
        # columns, long rows drop extra fields.
        data.append(dict(zip(headers, line.split("|"))))

    return data, headers
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# Task name -> special token the tokenizer uses to mark that task.
TASK_TOKEN_MAP = {
    "vc": "<|task_vc|>",
    "tts": "<|task_tts|>",
    "asr": "<|task_asr|>",
    "s2s": "<|task_s2s|>",
    "t2s": "<|task_t2s|>",
    "understand": "<|task_understand|>",
    "caption": "<|task_cap|>",
    "controllable_tts": "<|task_controllable_tts|>",
    "prompt_tts": "<|task_prompt_tts|>",
    "speech_edit": "<|task_edit|>",
}

# Attribute-level name -> 0-based level id embedded in special tokens.
LEVELS_MAP = {
    "very_low": 0,
    "low": 1,
    "moderate": 2,
    "high": 3,
    "very_high": 4,
}

# UI-facing variant: 1-based positions -> level names.
# NOTE(review): deliberately 1-indexed, unlike LEVELS_MAP — confirm callers.
LEVELS_MAP_UI = {1: "very_low", 2: "low", 3: "moderate", 4: "high", 5: "very_high"}

# Gender label -> id used in <|gender_N|> tokens.
GENDER_MAP = {
    "female": 0,
    "male": 1,
}

# Age-group label -> id used in <|age_N|> tokens.
AGE_MAP = {"Child": 0, "Teenager": 1, "Youth-Adult": 2, "Middle-aged": 3, "Elderly": 4}

# Emotion label -> id used in <|emotion_N|> tokens.
EMO_MAP = {
    "UNKNOWN": 0,
    "NEUTRAL": 1,
    "ANGRY": 2,
    "HAPPY": 3,
    "SAD": 4,
    "FEARFUL": 5,
    "DISGUSTED": 6,
    "SURPRISED": 7,
    "SARCASTIC": 8,
    "EXCITED": 9,
    "SLEEPY": 10,
    "CONFUSED": 11,
    "EMPHASIS": 12,
    "LAUGHING": 13,
    "SINGING": 14,
    "WORRIED": 15,
    "WHISPER": 16,
    "ANXIOUS": 17,
    "NO-AGREEMENT": 18,
    "APOLOGETIC": 19,
    "CONCERNED": 20,
    "ENUNCIATED": 21,
    "ASSERTIVE": 22,
    "ENCOURAGING": 23,
    "CONTEMPT": 24,
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class TokenParser:
    """Turn attribute labels and values into special tokens.

    All helpers are static: the class holds no state. Numeric helpers clamp
    their input to the token vocabulary's supported range before formatting.
    """

    # Fix: the original defined __init__ twice (with a stray bare-string
    # statement between them); the duplicate is removed here.
    def __init__(self):
        pass

    @staticmethod
    def age(age: str) -> str:
        """Return the age special token, e.g. ``<|age_0|>``."""
        age_id = AGE_MAP[age]
        return f"<|age_{age_id}|>"

    @staticmethod
    def gender(gender: str) -> str:
        """Return the gender special token."""
        gender_id = GENDER_MAP[gender]
        return f"<|gender_{gender_id}|>"

    @staticmethod
    def mel_value(mel: int):
        """Return the mel-scale pitch token, clamping mel to [0, 1000]."""
        mel = max(0, int(mel))
        mel = min(1000, int(mel))
        return f"<|pitch_value_{mel}|>"

    @staticmethod
    def mel_level(level: str):
        """Return the pitch-level token for a LEVELS_MAP level name."""
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_label_{level_tag}|>"

    @staticmethod
    def pitch_var_value(pitch_std: int):
        """Return the pitch-std token, clamping pitch_std to [0, 10]."""
        assert isinstance(pitch_std, int)
        pitch_std = max(0, int(pitch_std))
        pitch_std = min(10, int(pitch_std))
        return f"<|pitch_var_value_{pitch_std}|>"

    @staticmethod
    def pitch_var_level(level: str):
        """Return the pitch-std-level token."""
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_var_label_{level_tag}|>"

    @staticmethod
    def loudness_value(loudness: int):
        """Return the loudness token, clamping loudness to [0, 30]."""
        assert loudness >= 0
        loudness = max(0, int(loudness))
        loudness = min(30, int(loudness))
        return f"<|loudness_value_{loudness}|>"

    @staticmethod
    def loudness_level(level: str):
        """Return the loudness-level token."""
        level_tag = LEVELS_MAP[level]
        return f"<|loudness_label_{level_tag}|>"

    @staticmethod
    def speed_value(speed: int):
        """Return the speed token, clamping speed to [0, 10]."""
        speed = max(0, int(speed))
        speed = min(10, int(speed))
        return f"<|speed_value_{speed}|>"

    @staticmethod
    def speed_level(level: str):
        """Return the speed-level token."""
        level_tag = LEVELS_MAP[level]
        return f"<|speed_label_{level_tag}|>"

    @staticmethod
    def task(task: str) -> str:
        """Return the task token; task must be a TASK_TOKEN_MAP key."""
        assert task in TASK_TOKEN_MAP.keys()

        return TASK_TOKEN_MAP[task]

    @staticmethod
    def emotion(emotion: str):
        """Return the emotion token for an EMO_MAP label."""
        emo_id = EMO_MAP[emotion]

        return f"<|emotion_{emo_id}|>"
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# Manual smoke test: build attribute token strings and round-trip them
# through the (locally stored) tokenizer.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
    )

    tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
    ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
    genders = ["female", "female", "female", "male", "male"]
    mels = [100, 200, 300, 400, 500]
    mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
    loudnesses = [1, 10, 23, 19, 30]
    loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
    emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]

    for case_idx in range(5):
        # Concatenate one token per attribute, in the order the model expects.
        token_sequence = "".join(
            [
                TokenParser.task(tasks[case_idx]),
                TokenParser.age(ages[case_idx]),
                TokenParser.gender(genders[case_idx]),
                TokenParser.mel_value(mels[case_idx]),
                TokenParser.mel_level(mel_levels[case_idx]),
                TokenParser.loudness_value(loudnesses[case_idx]),
                TokenParser.loudness_level(loudness_levels[case_idx]),
                TokenParser.emotion(emotions[case_idx]),
            ]
        )
        ids = tokenizer.encode(token_sequence, add_special_tokens=False)
        print(ids)
        print("decode", tokenizer.decode(ids))
|
|
File without changes
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
|
|
3
|
+
import mlx.core as mx
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from mlx_audio.tts.models.base import BaseModelArgs, check_array_shape
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TestBaseModel(unittest.TestCase):
    """Tests for BaseModelArgs.from_dict and check_array_shape."""

    def test_base_model_args_from_dict(self):
        """Test BaseModelArgs.from_dict method."""

        # Local subclass exercising required and optional parameters.
        class TestArgs(BaseModelArgs):
            def __init__(self, param1, param2, param3=None):
                self.param1 = param1
                self.param2 = param2
                self.param3 = param3

        # Every parameter supplied explicitly.
        full = TestArgs.from_dict({"param1": 1, "param2": "test", "param3": True})
        self.assertEqual(full.param1, 1)
        self.assertEqual(full.param2, "test")
        self.assertEqual(full.param3, True)

        # Unknown keys must be dropped silently.
        extra = TestArgs.from_dict(
            {"param1": 1, "param2": "test", "param3": True, "extra": "ignored"}
        )
        self.assertEqual(extra.param1, 1)
        self.assertEqual(extra.param2, "test")
        self.assertEqual(extra.param3, True)
        self.assertFalse(hasattr(extra, "extra"))

        # An omitted optional parameter falls back to its default.
        partial = TestArgs.from_dict({"param1": 1, "param2": "test"})
        self.assertEqual(partial.param1, 1)
        self.assertEqual(partial.param2, "test")
        self.assertIsNone(partial.param3)

    def test_check_array_shape(self):
        """Test check_array_shape function."""
        # Accepted: 3-D with out_channels >= kH == kW.
        self.assertTrue(check_array_shape(mx.array(np.zeros((64, 3, 3)))))
        # Rejected: kernel is not square (kH != kW).
        self.assertFalse(check_array_shape(mx.array(np.zeros((64, 3, 4)))))
        # Rejected: out_channels smaller than the kernel size.
        self.assertFalse(check_array_shape(mx.array(np.zeros((2, 3, 3)))))
        # Rejected: too few dimensions.
        self.assertFalse(check_array_shape(mx.array(np.zeros((64, 3)))))
        # Rejected: too many dimensions.
        self.assertFalse(check_array_shape(mx.array(np.zeros((64, 3, 3, 3)))))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Allow running this test module directly: `python <this_file>.py`.
if __name__ == "__main__":
    unittest.main()
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import sys # Import sys to patch argv
|
|
2
|
+
import unittest
|
|
3
|
+
from unittest.mock import MagicMock, patch
|
|
4
|
+
|
|
5
|
+
from mlx_audio.tts.convert import configure_parser, main
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestConvert(unittest.TestCase):
    """CLI tests for mlx_audio.tts.convert: each test patches sys.argv,
    runs main(), and asserts the exact keyword arguments forwarded to the
    (mocked) convert() function."""

    def setUp(self):
        # Parser instance kept for reference; main() builds its own parser
        # from the same configure_parser() — TODO confirm it is needed here.
        self.parser = configure_parser()

        # Mock the actual convert function
        self.convert_mock = MagicMock()
        self.patcher = patch("mlx_audio.tts.convert.convert", new=self.convert_mock)
        self.patcher.start()

    def tearDown(self):
        # Undo the convert() patch so other test modules see the real one.
        self.patcher.stop()

    def test_basic_conversion(self):
        """Explicit --hf-path/--mlx-path/--dtype are forwarded verbatim."""
        test_args = [
            "--hf-path",
            "dummy_hf",
            "--mlx-path",
            "dummy_mlx",
            "--dtype",
            "float16",
        ]
        # Patch sys.argv for this test run
        with patch.object(sys, "argv", ["convert.py"] + test_args):
            main()

        self.convert_mock.assert_called_once_with(
            hf_path="dummy_hf",
            mlx_path="dummy_mlx",
            quantize=False,
            q_group_size=64,
            q_bits=4,
            quant_predicate=None,
            dtype="float16",
            upload_repo=None,
            dequantize=False,
        )

    def test_quantized_conversion(self):
        """--quantize with custom group size and bit width is forwarded."""
        test_args = [
            "--hf-path",
            "dummy_hf",
            "--quantize",
            "--q-group-size",
            "128",
            "--q-bits",
            "8",
        ]
        # Patch sys.argv for this test run
        with patch.object(sys, "argv", ["convert.py"] + test_args):
            main()

        self.convert_mock.assert_called_once_with(
            hf_path="dummy_hf",
            mlx_path="mlx_model",  # Default mlx_path
            quantize=True,
            q_group_size=128,
            q_bits=8,
            quant_predicate=None,
            dtype="float16",  # Should be ignored when quantize=True
            upload_repo=None,
            dequantize=False,
        )

    def test_quantized_conversion_invalid_group_size_raises_error(self):
        """Tests if main raises ValueError for invalid group size."""
        test_args = [
            "--hf-path",
            "dummy_hf",
            "--quantize",
            "--q-group-size",
            "100",  # Invalid: not 64 or 128
            "--q-bits",
            "4",
        ]

        # Configure the mock to raise ValueError when called with q_group_size=100
        # (simulates the real convert()'s validation, since the mock replaces it).
        def side_effect(*args, **kwargs):
            if kwargs.get("q_group_size") == 100:
                raise ValueError(
                    "[quantize] The requested group size 100 is not supported."
                )
            return MagicMock()  # Default return for other calls if needed

        self.convert_mock.side_effect = side_effect

        # Patch sys.argv and assert ValueError is raised
        with patch.object(sys, "argv", ["convert.py"] + test_args):
            with self.assertRaisesRegex(
                ValueError, "requested group size 100 is not supported"
            ):
                main()

        # Verify the mock was called (even though it raised an error)
        self.convert_mock.assert_called_once_with(
            hf_path="dummy_hf",
            mlx_path="mlx_model",
            quantize=True,
            q_group_size=100,
            q_bits=4,
            quant_predicate=None,
            dtype="float16",
            upload_repo=None,
            dequantize=False,
        )

    def test_quantization_recipes(self):
        """Each named --quant-predicate recipe is passed through unchanged."""
        for recipe in ["mixed_2_6", "mixed_3_6", "mixed_4_6"]:
            with self.subTest(recipe=recipe):
                self.convert_mock.reset_mock()  # Reset mock for each subtest
                test_args = ["--hf-path", "dummy_hf", "--quant-predicate", recipe]
                # Patch sys.argv for this test run
                with patch.object(sys, "argv", ["convert.py"] + test_args):
                    main()

                self.convert_mock.assert_called_once_with(  # Changed to assert_called_once_with
                    hf_path="dummy_hf",
                    mlx_path="mlx_model",  # Default mlx_path
                    quantize=False,  # Default quantize
                    q_group_size=64,  # Default q_group_size
                    q_bits=4,  # Default q_bits
                    quant_predicate=recipe,
                    dtype="float16",  # Default dtype
                    upload_repo=None,  # Default upload_repo
                    dequantize=False,  # Default dequantize
                )
                # No need to reset mock here, it's handled at the start of the loop

    def test_dequantize_flag(self):
        """--dequantize flips only the dequantize keyword argument."""
        test_args = ["--hf-path", "dummy_hf", "--dequantize"]
        # Patch sys.argv for this test run
        with patch.object(sys, "argv", ["convert.py"] + test_args):
            main()

        self.convert_mock.assert_called_once_with(
            hf_path="dummy_hf",
            mlx_path="mlx_model",  # Default mlx_path
            quantize=False,
            q_group_size=64,
            q_bits=4,
            quant_predicate=None,
            dtype="float16",
            upload_repo=None,
            dequantize=True,
        )

    def test_upload_repo_argument(self):
        """--upload-repo is forwarded as the upload_repo keyword."""
        test_args = ["--hf-path", "dummy_hf", "--upload-repo", "my/repo"]
        # Patch sys.argv for this test run
        with patch.object(sys, "argv", ["convert.py"] + test_args):
            main()

        self.convert_mock.assert_called_once_with(
            hf_path="dummy_hf",
            mlx_path="mlx_model",  # Default mlx_path
            quantize=False,
            q_group_size=64,
            q_bits=4,
            quant_predicate=None,
            dtype="float16",
            upload_repo="my/repo",
            dequantize=False,
        )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# Allow running this test module directly: `python <this_file>.py`.
if __name__ == "__main__":
    unittest.main()
|