nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaai/__init__.py +99 -0
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +68 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +93 -0
- nexaai/asr_impl/pybind_asr_impl.py +127 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +7 -0
- nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
- nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
- nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
- nexaai/binds/cpu_gpu/libggml.dylib +0 -0
- nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
- nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/metal/libnexa_plugin.dylib +0 -0
- nexaai/binds/metal/py-lib/ml.py +888 -0
- nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
- nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
- nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
- nexaai/binds/metal/py-lib/profiling.py +239 -0
- nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
- nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
- nexaai/binds/nexaml/libggml-base.dylib +0 -0
- nexaai/binds/nexaml/libggml-cpu.so +0 -0
- nexaai/binds/nexaml/libggml-metal.so +0 -0
- nexaai/binds/nexaml/libggml.dylib +0 -0
- nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
- nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
- nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
- nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
- nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
- nexaai/binds/nexaml/libnexaproc.dylib +0 -0
- nexaai/binds/nexaml/libomp.dylib +0 -0
- nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
- nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
- nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
- nexaai/common.py +106 -0
- nexaai/cv.py +95 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +91 -0
- nexaai/cv_impl/pybind_cv_impl.py +124 -0
- nexaai/diarize.py +80 -0
- nexaai/diarize_impl/__init__.py +1 -0
- nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
- nexaai/embedder.py +73 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
- nexaai/image_gen.py +141 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
- nexaai/llm.py +98 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +271 -0
- nexaai/llm_impl/pybind_llm_impl.py +238 -0
- nexaai/log.py +92 -0
- nexaai/mlx_backend/asr/__init__.py +12 -0
- nexaai/mlx_backend/asr/interface.py +122 -0
- nexaai/mlx_backend/common/__init__.py +0 -0
- nexaai/mlx_backend/common/utils.py +25 -0
- nexaai/mlx_backend/cv/__init__.py +0 -0
- nexaai/mlx_backend/cv/generate.py +195 -0
- nexaai/mlx_backend/cv/interface.py +162 -0
- nexaai/mlx_backend/cv/main.py +81 -0
- nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
- nexaai/mlx_backend/embedding/__init__.py +0 -0
- nexaai/mlx_backend/embedding/generate.py +333 -0
- nexaai/mlx_backend/embedding/interface.py +617 -0
- nexaai/mlx_backend/embedding/main.py +173 -0
- nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
- nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
- nexaai/mlx_backend/image_gen/__init__.py +1 -0
- nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
- nexaai/mlx_backend/image_gen/interface.py +82 -0
- nexaai/mlx_backend/image_gen/main.py +281 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
- nexaai/mlx_backend/llm/__init__.py +0 -0
- nexaai/mlx_backend/llm/generate.py +149 -0
- nexaai/mlx_backend/llm/interface.py +764 -0
- nexaai/mlx_backend/llm/main.py +68 -0
- nexaai/mlx_backend/ml.py +888 -0
- nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/mlx_backend/mlx_audio/server.py +525 -0
- nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
- nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
- nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
- nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
- nexaai/mlx_backend/mlx_audio/utils.py +237 -0
- nexaai/mlx_backend/mlx_audio/version.py +1 -0
- nexaai/mlx_backend/profiling.py +239 -0
- nexaai/mlx_backend/rerank/__init__.py +0 -0
- nexaai/mlx_backend/rerank/generate.py +174 -0
- nexaai/mlx_backend/rerank/interface.py +287 -0
- nexaai/mlx_backend/rerank/main.py +127 -0
- nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
- nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
- nexaai/mlx_backend/sd/__init__.py +1 -0
- nexaai/mlx_backend/sd/interface.py +362 -0
- nexaai/mlx_backend/sd/main.py +286 -0
- nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
- nexaai/mlx_backend/sd/modeling/clip.py +116 -0
- nexaai/mlx_backend/sd/modeling/config.py +65 -0
- nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
- nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
- nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
- nexaai/mlx_backend/sd/modeling/unet.py +460 -0
- nexaai/mlx_backend/sd/modeling/vae.py +274 -0
- nexaai/mlx_backend/tts/__init__.py +12 -0
- nexaai/mlx_backend/tts/interface.py +276 -0
- nexaai/mlx_backend/vlm/__init__.py +3 -0
- nexaai/mlx_backend/vlm/generate.py +572 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +559 -0
- nexaai/mlx_backend/vlm/main.py +365 -0
- nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
- nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
- nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
- nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
- nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
- nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
- nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
- nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
- nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
- nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
- nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
- nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
- nexaai/rerank.py +57 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
- nexaai/runtime.py +68 -0
- nexaai/runtime_error.py +24 -0
- nexaai/tts.py +75 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +94 -0
- nexaai/tts_impl/pybind_tts_impl.py +43 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/manifest_utils.py +531 -0
- nexaai/utils/model_manager.py +1745 -0
- nexaai/utils/model_types.py +49 -0
- nexaai/utils/progress_tracker.py +389 -0
- nexaai/utils/quantization_utils.py +245 -0
- nexaai/vlm.py +130 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
- nexaai-1.0.29.dist-info/METADATA +35 -0
- nexaai-1.0.29.dist-info/RECORD +580 -0
- nexaai-1.0.29.dist-info/WHEEL +5 -0
- nexaai-1.0.29.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
import mlx.core as mx
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pyloudnorm as pyln
|
|
6
|
+
import scipy.signal
|
|
7
|
+
import soundfile as sf
|
|
8
|
+
|
|
9
|
+
from mlx_audio.codec import DAC
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def process_audio_array(
    audio: mx.array,
    sample_rate: int = 24000,
    target_loudness: float = -18.0,
    peak_limit: float = -1,
    block_size: float = 0.400,
) -> mx.array:
    """Loudness-normalize and peak-limit audio, returning shape ``(1, 1, samples)``.

    The input is reduced to a 1-D signal, its integrated loudness is measured
    with ``pyloudnorm``, gain is applied to reach ``target_loudness``, and the
    result is peak-normalized if it exceeds ``peak_limit``.

    Args:
        audio: Audio samples. Arrays with more than one dimension are
            collapsed: if axis 1 has more than one entry it is averaged away,
            otherwise singleton axes are squeezed out.
        sample_rate: Sample rate handed to the loudness meter.
        target_loudness: Loudness passed to ``pyln.normalize.loudness`` as the
            target.
        peak_limit: Peak ceiling in dB (converted to linear amplitude with
            ``10 ** (peak_limit / 20)``).
        block_size: Meter block size in seconds; also defines the minimum
            number of samples the meter is given.

    Returns:
        The processed signal as an ``mx.array`` reshaped to ``(1, 1, -1)``.
    """
    audio_np = np.array(audio)

    # Handle multi-channel / batched layouts by collapsing to one dimension.
    if audio_np.ndim > 1:
        if audio_np.shape[1] > 1:
            audio_np = np.mean(audio_np, axis=1)
        else:
            # atleast_1d keeps a single-sample input from becoming a 0-d
            # array, which would make len() below raise.
            audio_np = np.atleast_1d(np.squeeze(audio_np))

    original_length = len(audio_np)
    min_samples = int(block_size * sample_rate)
    needs_padding = original_length < min_samples

    # The meter needs at least one full block, so short clips are
    # zero-padded for measurement and trimmed again before returning.
    if needs_padding:
        audio_padded = np.pad(
            audio_np, (0, min_samples - original_length), mode="constant"
        )
    else:
        audio_padded = audio_np

    # Measure and normalize loudness.
    meter = pyln.Meter(sample_rate, block_size=block_size)
    measured_loudness = meter.integrated_loudness(audio_padded)

    if np.isfinite(measured_loudness):
        normalized = pyln.normalize.loudness(
            audio_padded, measured_loudness, target_loudness
        )
    else:
        # Silent (or fully gated) input measures as -inf; applying the
        # resulting infinite gain would turn every sample into NaN/inf.
        # Leave the signal untouched instead.
        normalized = audio_padded

    # Apply peak limiting only if the signal exceeds the linear threshold.
    peak_value = np.max(np.abs(normalized))
    threshold_value = 10 ** (peak_limit / 20)
    if peak_value > threshold_value:
        normalized = pyln.normalize.peak(normalized, peak_limit)

    if needs_padding:
        normalized = normalized[:original_length]

    return mx.array(normalized).reshape(1, 1, -1)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class DacInterface:
    """Convenience wrapper around a pretrained DAC codec.

    Provides helpers to load audio from disk, convert it to the codec's
    sample rate / channel layout, and encode to or decode from codec codes in
    fixed-size chunks.
    """

    def __init__(self, repo_id: str = "mlx-community/dac-speech-24khz-1.5kbps"):
        """Load the DAC model identified by ``repo_id``."""
        self.model = DAC.from_pretrained(repo_id)
        # Sample rate that load/encode/decode resample to and assume.
        self.sr = 24000

    @staticmethod
    def _progress_range(verbose: bool):
        """Return ``tqdm.trange`` when ``verbose`` is set, else builtin ``range``."""
        if verbose:
            # Imported lazily so tqdm is only required for verbose runs.
            from tqdm import trange

            return trange
        return range

    def convert_audio(
        self, audio: mx.array, sr: int, target_sr: int, target_channels: int
    ):
        """Convert ``audio`` to ``target_channels`` channels at ``target_sr``.

        Args:
            audio: Audio whose last two axes are read as ``(channels, length)``;
                1-D input is treated as a single channel.
            sr: Sample rate of ``audio``.
            target_sr: Desired sample rate.
            target_channels: ``1`` averages the channels down to mono; ``2``
                duplicates a mono channel or keeps the first two channels.
                Other values leave the channel layout unchanged.

        Returns:
            The converted audio as an ``mx.array``.
        """
        audio_np = np.array(audio)

        if audio_np.ndim < 2:
            audio_np = audio_np.reshape(1, -1)

        channels, length = audio_np.shape[-2:]

        if target_channels == 1:
            if channels > 1:
                audio_np = np.mean(audio_np, axis=-2, keepdims=True)
        elif target_channels == 2:
            if channels == 1:
                audio_np = np.repeat(audio_np, 2, axis=-2)
            elif channels > 2:
                audio_np = audio_np[..., :2, :]

        if sr != target_sr:
            new_length = int(length * target_sr / sr)
            # Resample along the time axis in one call; this also works when
            # the array carries leading batch dimensions.
            audio_np = scipy.signal.resample(audio_np, new_length, axis=-1)

        return mx.array(audio_np)

    def convert_audio_array(self, audio: mx.array, sr):
        """Convert ``audio`` from ``sr`` to mono at the interface sample rate."""
        return self.convert_audio(audio, sr, self.sr, 1)

    def load_audio(self, path):
        """Read an audio file and return it as mono ``(1, 1, samples)`` at ``self.sr``."""
        audio_np, sr = sf.read(path)
        audio = mx.array(audio_np)
        if len(audio.shape) == 1:
            audio = audio.reshape(1, -1)
        # If the first axis is the longer one, treat it as time and transpose
        # to channels-first.
        elif len(audio.shape) > 1 and audio.shape[0] > audio.shape[1]:
            audio = audio.T
        return self.convert_audio_array(audio, sr).reshape(1, 1, -1)

    def preprocess(self, audio_data):
        """Right-pad the last axis with zeros to a multiple of the model hop length.

        ``audio_data`` must be 3-D, since padding is specified for three axes.
        """
        length = audio_data.shape[-1]
        hop_length = self.model.hop_length
        right_pad = math.ceil(length / hop_length) * hop_length - length
        return mx.pad(audio_data, [(0, 0), (0, 0), (0, right_pad)])

    def encode(self, x: mx.array, win_duration: float = 5.0, verbose: bool = False):
        """Encode audio into codec codes, processing it window by window.

        Args:
            x: Audio to encode; it is loudness-normalized first, which yields
                a ``(1, 1, samples)`` array.
            win_duration: Window length in seconds; rounded up to a whole
                number of model hops.
            verbose: Show a tqdm progress bar over the windows.

        Returns:
            The per-window codes concatenated along the last axis.
        """
        x = process_audio_array(x, sample_rate=self.sr)
        nb, nac, nt = x.shape
        x = x.reshape(nb * nac, 1, nt)

        hop_length = self.model.hop_length
        n_samples = int(win_duration * self.sr)
        # Round the window up so every full window is hop-aligned.
        n_samples = int(math.ceil(n_samples / hop_length) * hop_length)

        range_fn = self._progress_range(verbose)
        codes_list = []
        for start in range_fn(0, nt, n_samples):
            chunk = x[..., start : start + n_samples]
            # The final chunk may be short; preprocess pads it to a hop multiple.
            audio_data = self.preprocess(chunk)
            _, c, _, _, _ = self.model.encode(audio_data, None)
            codes_list.append(c)

        return mx.concatenate(codes_list, axis=-1)

    def decode(self, codes: mx.array, verbose: bool = False) -> mx.array:
        """Decode codec codes back to loudness-normalized audio.

        Args:
            codes: Codes to decode; they are split along the last axis into
                chunks of 4096 entries.
            verbose: Show a tqdm progress bar over the chunks.

        Returns:
            The reconstructed audio after ``process_audio_array``, i.e. with
            shape ``(1, 1, samples)``.
        """
        model = self.model
        chunk_length = 4096
        range_fn = self._progress_range(verbose)

        @mx.compile
        def decode_chunk(chunk_codes):
            z = model.quantizer.from_codes(chunk_codes)[0]
            return model.decode(z)

        pieces = [
            decode_chunk(codes[..., start : start + chunk_length])
            for start in range_fn(0, codes.shape[-1], chunk_length)
        ]
        recons = mx.concatenate(pieces, axis=-1)

        # NOTE(review): axes 1 and 2 are swapped before normalization,
        # presumably because the decoder output is channels-last; confirm.
        return process_audio_array(recons.swapaxes(1, 2), sample_rate=self.sr)
|
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
{
|
|
2
|
+
"text": "The cat watched from the windowsill, tail flicking with quiet curiosity as the first snowflakes of winter began to fall, dusting the world in fragile white.",
|
|
3
|
+
"words": [
|
|
4
|
+
{
|
|
5
|
+
"word": "The",
|
|
6
|
+
"duration": 0.2,
|
|
7
|
+
"c1": [
|
|
8
|
+
720, 720, 474, 691, 607, 126, 597, 607, 897, 288, 362, 903, 333, 1009,
|
|
9
|
+
79
|
|
10
|
+
],
|
|
11
|
+
"c2": [
|
|
12
|
+
658, 663, 237, 915, 74, 74, 966, 721, 893, 722, 630, 516, 861, 385, 149
|
|
13
|
+
],
|
|
14
|
+
"features": {
|
|
15
|
+
"energy": 10,
|
|
16
|
+
"spectral_centroid": 15,
|
|
17
|
+
"pitch": 45
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"word": "cat",
|
|
22
|
+
"duration": 0.33,
|
|
23
|
+
"c1": [
|
|
24
|
+
700, 597, 639, 838, 622, 336, 975, 326, 67, 375, 853, 761, 35, 363, 31,
|
|
25
|
+
1000, 982, 192, 647, 564, 329, 1002, 275, 480, 551
|
|
26
|
+
],
|
|
27
|
+
"c2": [
|
|
28
|
+
34, 810, 457, 546, 42, 631, 339, 867, 115, 1011, 509, 369, 473, 85, 190,
|
|
29
|
+
715, 391, 518, 562, 986, 749, 193, 530, 327, 820
|
|
30
|
+
],
|
|
31
|
+
"features": {
|
|
32
|
+
"energy": 14,
|
|
33
|
+
"spectral_centroid": 21,
|
|
34
|
+
"pitch": 35
|
|
35
|
+
}
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"word": "watched",
|
|
39
|
+
"duration": 0.44,
|
|
40
|
+
"c1": [
|
|
41
|
+
625, 668, 168, 524, 462, 151, 549, 951, 597, 820, 489, 329, 377, 144,
|
|
42
|
+
112, 16, 481, 133, 195, 744, 144, 750, 288, 500, 1000, 58, 916, 597, 72,
|
|
43
|
+
336, 224, 476, 581
|
|
44
|
+
],
|
|
45
|
+
"c2": [
|
|
46
|
+
204, 421, 318, 677, 74, 953, 903, 413, 809, 37, 634, 824, 933, 200, 14,
|
|
47
|
+
1007, 111, 17, 435, 718, 559, 783, 415, 821, 958, 247, 14, 721, 158,
|
|
48
|
+
235, 276, 875, 683
|
|
49
|
+
],
|
|
50
|
+
"features": {
|
|
51
|
+
"energy": 19,
|
|
52
|
+
"spectral_centroid": 21,
|
|
53
|
+
"pitch": 26
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
"word": "from",
|
|
58
|
+
"duration": 0.2,
|
|
59
|
+
"c1": [
|
|
60
|
+
528, 668, 738, 985, 126, 924, 1003, 325, 393, 86, 114, 392, 638, 915,
|
|
61
|
+
549
|
|
62
|
+
],
|
|
63
|
+
"c2": [
|
|
64
|
+
929, 872, 332, 296, 983, 406, 867, 568, 374, 328, 419, 348, 177, 379,
|
|
65
|
+
181
|
|
66
|
+
],
|
|
67
|
+
"features": {
|
|
68
|
+
"energy": 10,
|
|
69
|
+
"spectral_centroid": 29,
|
|
70
|
+
"pitch": 14
|
|
71
|
+
}
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"word": "the",
|
|
75
|
+
"duration": 0.12,
|
|
76
|
+
"c1": [470, 985, 152, 474, 967, 558, 460, 728, 470],
|
|
77
|
+
"c2": [596, 246, 314, 246, 756, 238, 606, 262, 499],
|
|
78
|
+
"features": {
|
|
79
|
+
"energy": 23,
|
|
80
|
+
"spectral_centroid": 10,
|
|
81
|
+
"pitch": 23
|
|
82
|
+
}
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
"word": "windowsill,",
|
|
86
|
+
"duration": 0.75,
|
|
87
|
+
"c1": [
|
|
88
|
+
217, 126, 549, 700, 198, 891, 95, 683, 158, 680, 16, 769, 402, 776, 295,
|
|
89
|
+
258, 68, 213, 669, 865, 719, 29, 949, 329, 216, 481, 284, 224, 221, 359,
|
|
90
|
+
328, 311, 415, 443, 410, 359, 600, 590, 932, 611, 905, 304, 292, 72,
|
|
91
|
+
388, 333, 66, 943, 489, 648, 630, 648, 402, 972, 392, 558
|
|
92
|
+
],
|
|
93
|
+
"c2": [
|
|
94
|
+
911, 19, 1007, 169, 185, 182, 399, 849, 656, 963, 265, 80, 453, 768,
|
|
95
|
+
919, 1010, 501, 794, 141, 123, 93, 694, 499, 174, 768, 689, 598, 686,
|
|
96
|
+
10, 381, 282, 556, 126, 672, 872, 650, 990, 556, 913, 635, 174, 819,
|
|
97
|
+
999, 423, 64, 272, 112, 600, 453, 678, 791, 301, 206, 187, 819, 948
|
|
98
|
+
],
|
|
99
|
+
"features": {
|
|
100
|
+
"energy": 17,
|
|
101
|
+
"spectral_centroid": 25,
|
|
102
|
+
"pitch": 24
|
|
103
|
+
}
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"word": "tail",
|
|
107
|
+
"duration": 0.6,
|
|
108
|
+
"c1": [
|
|
109
|
+
669, 94, 917, 202, 607, 720, 625, 597, 126, 607, 885, 700, 474, 480,
|
|
110
|
+
126, 126, 551, 720, 126, 551, 720, 607, 572, 234, 114, 963, 963, 975,
|
|
111
|
+
587, 119, 378, 696, 730, 375, 46, 827, 515, 447, 979, 138, 22, 267, 43,
|
|
112
|
+
495, 16
|
|
113
|
+
],
|
|
114
|
+
"c2": [
|
|
115
|
+
1011, 336, 157, 39, 1000, 721, 862, 413, 557, 569, 74, 569, 141, 493,
|
|
116
|
+
124, 775, 204, 588, 74, 588, 810, 124, 102, 1021, 83, 848, 297, 339,
|
|
117
|
+
335, 684, 400, 905, 909, 710, 460, 115, 81, 628, 224, 663, 892, 247,
|
|
118
|
+
392, 234, 132
|
|
119
|
+
],
|
|
120
|
+
"features": {
|
|
121
|
+
"energy": 15,
|
|
122
|
+
"spectral_centroid": 23,
|
|
123
|
+
"pitch": 34
|
|
124
|
+
}
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
"word": "flicking",
|
|
128
|
+
"duration": 0.45,
|
|
129
|
+
"c1": [
|
|
130
|
+
978, 489, 630, 588, 436, 798, 4, 975, 245, 325, 415, 4, 393, 4, 4, 997,
|
|
131
|
+
982, 437, 444, 180, 861, 868, 225, 440, 780, 597, 720, 639, 168, 426,
|
|
132
|
+
114, 621, 854, 869
|
|
133
|
+
],
|
|
134
|
+
"c2": [
|
|
135
|
+
571, 321, 376, 232, 301, 678, 904, 630, 990, 772, 690, 870, 719, 694,
|
|
136
|
+
332, 558, 301, 194, 279, 443, 852, 64, 709, 401, 401, 14, 74, 873, 134,
|
|
137
|
+
754, 1002, 595, 540, 525
|
|
138
|
+
],
|
|
139
|
+
"features": {
|
|
140
|
+
"energy": 9,
|
|
141
|
+
"spectral_centroid": 22,
|
|
142
|
+
"pitch": 23
|
|
143
|
+
}
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"word": "with",
|
|
147
|
+
"duration": 0.23,
|
|
148
|
+
"c1": [
|
|
149
|
+
621, 392, 756, 459, 433, 881, 786, 198, 702, 847, 490, 27, 680, 146, 58,
|
|
150
|
+
808, 997
|
|
151
|
+
],
|
|
152
|
+
"c2": [
|
|
153
|
+
460, 840, 840, 303, 847, 534, 801, 99, 662, 666, 510, 132, 376, 96, 639,
|
|
154
|
+
240, 668
|
|
155
|
+
],
|
|
156
|
+
"features": {
|
|
157
|
+
"energy": 11,
|
|
158
|
+
"spectral_centroid": 15,
|
|
159
|
+
"pitch": 20
|
|
160
|
+
}
|
|
161
|
+
},
|
|
162
|
+
{
|
|
163
|
+
"word": "quiet",
|
|
164
|
+
"duration": 0.37,
|
|
165
|
+
"c1": [
|
|
166
|
+
969, 291, 572, 720, 625, 85, 698, 478, 811, 956, 232, 85, 962, 817, 986,
|
|
167
|
+
483, 835, 526, 77, 187, 178, 50, 440, 16, 198, 237, 418, 862
|
|
168
|
+
],
|
|
169
|
+
"c2": [
|
|
170
|
+
498, 606, 24, 629, 662, 181, 119, 678, 340, 736, 217, 204, 935, 796,
|
|
171
|
+
118, 478, 818, 791, 329, 209, 5, 234, 337, 647, 110, 922, 933, 1011
|
|
172
|
+
],
|
|
173
|
+
"features": {
|
|
174
|
+
"energy": 12,
|
|
175
|
+
"spectral_centroid": 12,
|
|
176
|
+
"pitch": 43
|
|
177
|
+
}
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
"word": "curiosity",
|
|
181
|
+
"duration": 0.71,
|
|
182
|
+
"c1": [
|
|
183
|
+
321, 402, 215, 607, 720, 224, 731, 621, 491, 720, 551, 456, 336, 688,
|
|
184
|
+
476, 953, 718, 806, 410, 786, 976, 664, 855, 433, 756, 396, 699, 776,
|
|
185
|
+
443, 739, 932, 22, 305, 353, 503, 564, 978, 407, 395, 798, 324, 168,
|
|
186
|
+
909, 328, 328, 443, 738, 114, 962, 681, 535, 701, 382
|
|
187
|
+
],
|
|
188
|
+
"c2": [
|
|
189
|
+
777, 665, 629, 327, 831, 764, 162, 725, 810, 170, 629, 774, 108, 948,
|
|
190
|
+
972, 449, 600, 905, 81, 765, 601, 422, 820, 746, 450, 346, 733, 77, 733,
|
|
191
|
+
81, 722, 576, 286, 271, 714, 95, 346, 133, 514, 799, 122, 900, 568, 666,
|
|
192
|
+
209, 668, 558, 630, 165, 587, 423, 904, 629
|
|
193
|
+
],
|
|
194
|
+
"features": {
|
|
195
|
+
"energy": 10,
|
|
196
|
+
"spectral_centroid": 29,
|
|
197
|
+
"pitch": 22
|
|
198
|
+
}
|
|
199
|
+
},
|
|
200
|
+
{
|
|
201
|
+
"word": "as",
|
|
202
|
+
"duration": 0.48,
|
|
203
|
+
"c1": [
|
|
204
|
+
474, 936, 336, 589, 254, 854, 79, 140, 863, 854, 701, 260, 929, 140,
|
|
205
|
+
669, 808, 411, 232, 434, 542, 597, 126, 551, 126, 607, 1011, 774, 681,
|
|
206
|
+
94, 25, 971, 288, 305, 347, 355, 415
|
|
207
|
+
],
|
|
208
|
+
"c2": [
|
|
209
|
+
267, 813, 232, 361, 77, 607, 252, 933, 508, 658, 846, 849, 873, 496,
|
|
210
|
+
832, 167, 440, 124, 557, 124, 736, 588, 569, 983, 497, 360, 810, 274,
|
|
211
|
+
588, 365, 517, 934, 957, 839, 646, 720
|
|
212
|
+
],
|
|
213
|
+
"features": {
|
|
214
|
+
"energy": 7,
|
|
215
|
+
"spectral_centroid": 31,
|
|
216
|
+
"pitch": 23
|
|
217
|
+
}
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
"word": "the",
|
|
221
|
+
"duration": 0.13,
|
|
222
|
+
"c1": [359, 568, 700, 985, 80, 580, 274, 129, 600, 794],
|
|
223
|
+
"c2": [423, 833, 245, 690, 209, 688, 765, 453, 677, 615],
|
|
224
|
+
"features": {
|
|
225
|
+
"energy": 9,
|
|
226
|
+
"spectral_centroid": 26,
|
|
227
|
+
"pitch": 20
|
|
228
|
+
}
|
|
229
|
+
},
|
|
230
|
+
{
|
|
231
|
+
"word": "first",
|
|
232
|
+
"duration": 0.36,
|
|
233
|
+
"c1": [
|
|
234
|
+
997, 325, 147, 4, 780, 669, 621, 896, 30, 686, 526, 399, 210, 783, 216,
|
|
235
|
+
144, 329, 448, 481, 288, 132, 600, 168, 221, 415, 415, 528
|
|
236
|
+
],
|
|
237
|
+
"c2": [
|
|
238
|
+
325, 666, 627, 629, 240, 665, 650, 481, 962, 328, 128, 358, 166, 264,
|
|
239
|
+
555, 30, 815, 10, 669, 525, 450, 746, 919, 621, 647, 16, 601
|
|
240
|
+
],
|
|
241
|
+
"features": {
|
|
242
|
+
"energy": 13,
|
|
243
|
+
"spectral_centroid": 28,
|
|
244
|
+
"pitch": 22
|
|
245
|
+
}
|
|
246
|
+
},
|
|
247
|
+
{
|
|
248
|
+
"word": "snowflakes",
|
|
249
|
+
"duration": 0.76,
|
|
250
|
+
"c1": [
|
|
251
|
+
1003, 680, 607, 720, 126, 668, 336, 224, 114, 997, 426, 997, 147, 221,
|
|
252
|
+
359, 328, 1003, 738, 974, 151, 782, 179, 190, 553, 453, 761, 778, 23,
|
|
253
|
+
128, 643, 125, 7, 345, 223, 275, 524, 325, 764, 114, 953, 70, 75, 449,
|
|
254
|
+
513, 783, 830, 825, 365, 819, 920, 669, 700, 700, 720, 220, 209, 221
|
|
255
|
+
],
|
|
256
|
+
"c2": [
|
|
257
|
+
276, 489, 810, 975, 775, 913, 1022, 818, 340, 481, 690, 366, 924, 782,
|
|
258
|
+
366, 481, 400, 998, 872, 556, 688, 719, 78, 952, 119, 412, 286, 847, 60,
|
|
259
|
+
381, 86, 694, 779, 55, 246, 374, 143, 91, 209, 640, 313, 873, 295, 355,
|
|
260
|
+
333, 705, 468, 1008, 317, 87, 105, 511, 260, 650, 574, 88, 690
|
|
261
|
+
],
|
|
262
|
+
"features": {
|
|
263
|
+
"energy": 12,
|
|
264
|
+
"spectral_centroid": 29,
|
|
265
|
+
"pitch": 22
|
|
266
|
+
}
|
|
267
|
+
},
|
|
268
|
+
{
|
|
269
|
+
"word": "of",
|
|
270
|
+
"duration": 0.15,
|
|
271
|
+
"c1": [443, 328, 528, 85, 313, 145, 588, 140, 114, 325, 325],
|
|
272
|
+
"c2": [924, 835, 400, 832, 397, 1011, 695, 716, 366, 489, 487],
|
|
273
|
+
"features": {
|
|
274
|
+
"energy": 7,
|
|
275
|
+
"spectral_centroid": 34,
|
|
276
|
+
"pitch": 13
|
|
277
|
+
}
|
|
278
|
+
},
|
|
279
|
+
{
|
|
280
|
+
"word": "winter",
|
|
281
|
+
"duration": 0.29,
|
|
282
|
+
"c1": [
|
|
283
|
+
559, 71, 549, 64, 902, 609, 206, 386, 428, 529, 92, 1020, 148, 456, 605,
|
|
284
|
+
673, 958, 897, 250, 716, 236, 232
|
|
285
|
+
],
|
|
286
|
+
"c2": [
|
|
287
|
+
891, 358, 1016, 185, 558, 392, 63, 45, 238, 404, 603, 520, 657, 628,
|
|
288
|
+
748, 649, 629, 298, 772, 483, 1008, 401
|
|
289
|
+
],
|
|
290
|
+
"features": {
|
|
291
|
+
"energy": 18,
|
|
292
|
+
"spectral_centroid": 16,
|
|
293
|
+
"pitch": 31
|
|
294
|
+
}
|
|
295
|
+
},
|
|
296
|
+
{
|
|
297
|
+
"word": "began",
|
|
298
|
+
"duration": 0.24,
|
|
299
|
+
"c1": [
|
|
300
|
+
490, 6, 596, 669, 1011, 700, 583, 349, 666, 783, 215, 126, 61, 22, 945,
|
|
301
|
+
773, 920, 975
|
|
302
|
+
],
|
|
303
|
+
"c2": [
|
|
304
|
+
194, 225, 140, 243, 14, 650, 929, 671, 323, 365, 556, 298, 707, 483,
|
|
305
|
+
550, 57, 127, 886
|
|
306
|
+
],
|
|
307
|
+
"features": {
|
|
308
|
+
"energy": 11,
|
|
309
|
+
"spectral_centroid": 12,
|
|
310
|
+
"pitch": 18
|
|
311
|
+
}
|
|
312
|
+
},
|
|
313
|
+
{
|
|
314
|
+
"word": "to",
|
|
315
|
+
"duration": 0.2,
|
|
316
|
+
"c1": [
|
|
317
|
+
265, 1021, 113, 178, 698, 561, 97, 402, 25, 916, 766, 660, 159, 945, 967
|
|
318
|
+
],
|
|
319
|
+
"c2": [
|
|
320
|
+
141, 976, 455, 403, 760, 738, 519, 123, 327, 721, 690, 904, 689, 140,
|
|
321
|
+
615
|
|
322
|
+
],
|
|
323
|
+
"features": {
|
|
324
|
+
"energy": 13,
|
|
325
|
+
"spectral_centroid": 19,
|
|
326
|
+
"pitch": 20
|
|
327
|
+
}
|
|
328
|
+
},
|
|
329
|
+
{
|
|
330
|
+
"word": "fall,",
|
|
331
|
+
"duration": 0.39,
|
|
332
|
+
"c1": [
|
|
333
|
+
781, 325, 4, 114, 997, 415, 4, 443, 953, 781, 399, 993, 489, 383, 920,
|
|
334
|
+
383, 272, 755, 843, 450, 763, 392, 411, 682, 895, 443, 490, 863, 79
|
|
335
|
+
],
|
|
336
|
+
"c2": [
|
|
337
|
+
143, 990, 209, 990, 990, 556, 462, 952, 914, 702, 301, 833, 779, 982,
|
|
338
|
+
26, 458, 519, 9, 264, 74, 304, 110, 646, 905, 185, 959, 53, 543, 909
|
|
339
|
+
],
|
|
340
|
+
"features": {
|
|
341
|
+
"energy": 13,
|
|
342
|
+
"spectral_centroid": 14,
|
|
343
|
+
"pitch": 18
|
|
344
|
+
}
|
|
345
|
+
},
|
|
346
|
+
{
|
|
347
|
+
"word": "dusting",
|
|
348
|
+
"duration": 0.89,
|
|
349
|
+
"c1": [
|
|
350
|
+
27, 669, 490, 691, 691, 625, 625, 572, 474, 885, 215, 215, 215, 215,
|
|
351
|
+
215, 215, 75, 718, 94, 924, 232, 818, 14, 232, 985, 547, 955, 4, 627,
|
|
352
|
+
524, 524, 579, 462, 104, 597, 720, 720, 491, 597, 571, 802, 864, 315,
|
|
353
|
+
515, 832, 219, 133, 923, 773, 245, 415, 328, 590, 80, 528, 322, 808,
|
|
354
|
+
551, 625, 716, 158, 562, 712, 477, 905, 920, 424
|
|
355
|
+
],
|
|
356
|
+
"c2": [
|
|
357
|
+
206, 521, 77, 447, 260, 810, 74, 301, 243, 775, 243, 775, 880, 862,
|
|
358
|
+
1017, 806, 806, 631, 873, 806, 806, 722, 14, 531, 630, 500, 990, 240,
|
|
359
|
+
690, 431, 240, 815, 449, 273, 903, 569, 325, 629, 872, 239, 686, 189,
|
|
360
|
+
774, 264, 314, 628, 107, 120, 560, 929, 1008, 610, 24, 929, 400, 949,
|
|
361
|
+
431, 721, 447, 443, 774, 392, 923, 855, 747, 144, 460
|
|
362
|
+
],
|
|
363
|
+
"features": {
|
|
364
|
+
"energy": 14,
|
|
365
|
+
"spectral_centroid": 28,
|
|
366
|
+
"pitch": 30
|
|
367
|
+
}
|
|
368
|
+
},
|
|
369
|
+
{
|
|
370
|
+
"word": "the",
|
|
371
|
+
"duration": 0.12,
|
|
372
|
+
"c1": [396, 433, 276, 530, 316, 117, 112, 7, 531],
|
|
373
|
+
"c2": [332, 479, 262, 239, 123, 239, 453, 499, 545],
|
|
374
|
+
"features": {
|
|
375
|
+
"energy": 23,
|
|
376
|
+
"spectral_centroid": 11,
|
|
377
|
+
"pitch": 30
|
|
378
|
+
}
|
|
379
|
+
},
|
|
380
|
+
{
|
|
381
|
+
"word": "world",
|
|
382
|
+
"duration": 0.32,
|
|
383
|
+
"c1": [
|
|
384
|
+
217, 489, 897, 607, 402, 383, 496, 937, 247, 206, 790, 32, 406, 856,
|
|
385
|
+
715, 458, 278, 481, 503, 399, 871, 453, 858, 392
|
|
386
|
+
],
|
|
387
|
+
"c2": [
|
|
388
|
+
593, 959, 461, 546, 242, 438, 81, 99, 939, 361, 269, 571, 525, 542, 246,
|
|
389
|
+
10, 613, 228, 913, 252, 132, 132, 287, 559
|
|
390
|
+
],
|
|
391
|
+
"features": {
|
|
392
|
+
"energy": 22,
|
|
393
|
+
"spectral_centroid": 11,
|
|
394
|
+
"pitch": 31
|
|
395
|
+
}
|
|
396
|
+
},
|
|
397
|
+
{
|
|
398
|
+
"word": "in",
|
|
399
|
+
"duration": 0.23,
|
|
400
|
+
"c1": [
|
|
401
|
+
558, 497, 436, 598, 607, 416, 311, 906, 955, 905, 448, 54, 92, 487, 770,
|
|
402
|
+
298, 490
|
|
403
|
+
],
|
|
404
|
+
"c2": [
|
|
405
|
+
838, 399, 420, 819, 325, 929, 124, 214, 1021, 728, 975, 688, 132, 718,
|
|
406
|
+
724, 911, 536
|
|
407
|
+
],
|
|
408
|
+
"features": {
|
|
409
|
+
"energy": 14,
|
|
410
|
+
"spectral_centroid": 16,
|
|
411
|
+
"pitch": 22
|
|
412
|
+
}
|
|
413
|
+
},
|
|
414
|
+
{
|
|
415
|
+
"word": "fragile",
|
|
416
|
+
"duration": 0.41,
|
|
417
|
+
"c1": [
|
|
418
|
+
415, 325, 953, 359, 325, 838, 359, 764, 842, 341, 706, 674, 971, 592,
|
|
419
|
+
507, 16, 628, 481, 626, 691, 1011, 610, 336, 476, 528, 637, 472, 251,
|
|
420
|
+
945, 811, 406
|
|
421
|
+
],
|
|
422
|
+
"c2": [
|
|
423
|
+
126, 990, 374, 143, 629, 868, 338, 91, 346, 393, 407, 987, 987, 1009,
|
|
424
|
+
617, 854, 824, 439, 789, 311, 810, 497, 664, 549, 135, 908, 702, 639,
|
|
425
|
+
320, 698, 414
|
|
426
|
+
],
|
|
427
|
+
"features": {
|
|
428
|
+
"energy": 13,
|
|
429
|
+
"spectral_centroid": 20,
|
|
430
|
+
"pitch": 18
|
|
431
|
+
}
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
"word": "white.",
|
|
435
|
+
"duration": 0.75,
|
|
436
|
+
"c1": [
|
|
437
|
+
26, 432, 1, 651, 998, 716, 998, 727, 978, 311, 85, 895, 279, 392, 669,
|
|
438
|
+
916, 549, 1011, 97, 597, 296, 392, 526, 998, 835, 468, 871, 405, 26,
|
|
439
|
+
759, 524, 107, 77, 22, 260, 682, 621, 79, 682, 411, 701, 972, 691, 720,
|
|
440
|
+
551, 597, 660, 224, 236, 70, 652, 215, 126, 474, 597, 625
|
|
441
|
+
],
|
|
442
|
+
"c2": [
|
|
443
|
+
475, 778, 695, 612, 913, 315, 536, 593, 55, 371, 19, 560, 821, 646, 151,
|
|
444
|
+
801, 821, 413, 14, 922, 629, 380, 417, 679, 487, 562, 821, 706, 324,
|
|
445
|
+
896, 169, 594, 810, 864, 810, 588, 862, 969, 14, 105, 528, 165, 420,
|
|
446
|
+
170, 821, 423, 977, 904, 690, 235, 702, 14, 124, 350, 74, 413
|
|
447
|
+
],
|
|
448
|
+
"features": {
|
|
449
|
+
"energy": 13,
|
|
450
|
+
"spectral_centroid": 11,
|
|
451
|
+
"pitch": 23
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
],
|
|
455
|
+
"global_features": {
|
|
456
|
+
"energy": 13,
|
|
457
|
+
"spectral_centroid": 20,
|
|
458
|
+
"pitch": 28
|
|
459
|
+
},
|
|
460
|
+
"interface_version": 3
|
|
461
|
+
}
|