nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaai/__init__.py +99 -0
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +68 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +93 -0
- nexaai/asr_impl/pybind_asr_impl.py +127 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +7 -0
- nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
- nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
- nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
- nexaai/binds/cpu_gpu/libggml.dylib +0 -0
- nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
- nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/metal/libnexa_plugin.dylib +0 -0
- nexaai/binds/metal/py-lib/ml.py +888 -0
- nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
- nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
- nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
- nexaai/binds/metal/py-lib/profiling.py +239 -0
- nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
- nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
- nexaai/binds/nexaml/libggml-base.dylib +0 -0
- nexaai/binds/nexaml/libggml-cpu.so +0 -0
- nexaai/binds/nexaml/libggml-metal.so +0 -0
- nexaai/binds/nexaml/libggml.dylib +0 -0
- nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
- nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
- nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
- nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
- nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
- nexaai/binds/nexaml/libnexaproc.dylib +0 -0
- nexaai/binds/nexaml/libomp.dylib +0 -0
- nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
- nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
- nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
- nexaai/common.py +106 -0
- nexaai/cv.py +95 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +91 -0
- nexaai/cv_impl/pybind_cv_impl.py +124 -0
- nexaai/diarize.py +80 -0
- nexaai/diarize_impl/__init__.py +1 -0
- nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
- nexaai/embedder.py +73 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
- nexaai/image_gen.py +141 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
- nexaai/llm.py +98 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +271 -0
- nexaai/llm_impl/pybind_llm_impl.py +238 -0
- nexaai/log.py +92 -0
- nexaai/mlx_backend/asr/__init__.py +12 -0
- nexaai/mlx_backend/asr/interface.py +122 -0
- nexaai/mlx_backend/common/__init__.py +0 -0
- nexaai/mlx_backend/common/utils.py +25 -0
- nexaai/mlx_backend/cv/__init__.py +0 -0
- nexaai/mlx_backend/cv/generate.py +195 -0
- nexaai/mlx_backend/cv/interface.py +162 -0
- nexaai/mlx_backend/cv/main.py +81 -0
- nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
- nexaai/mlx_backend/embedding/__init__.py +0 -0
- nexaai/mlx_backend/embedding/generate.py +333 -0
- nexaai/mlx_backend/embedding/interface.py +617 -0
- nexaai/mlx_backend/embedding/main.py +173 -0
- nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
- nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
- nexaai/mlx_backend/image_gen/__init__.py +1 -0
- nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
- nexaai/mlx_backend/image_gen/interface.py +82 -0
- nexaai/mlx_backend/image_gen/main.py +281 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
- nexaai/mlx_backend/llm/__init__.py +0 -0
- nexaai/mlx_backend/llm/generate.py +149 -0
- nexaai/mlx_backend/llm/interface.py +764 -0
- nexaai/mlx_backend/llm/main.py +68 -0
- nexaai/mlx_backend/ml.py +888 -0
- nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/mlx_backend/mlx_audio/server.py +525 -0
- nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
- nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
- nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
- nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
- nexaai/mlx_backend/mlx_audio/utils.py +237 -0
- nexaai/mlx_backend/mlx_audio/version.py +1 -0
- nexaai/mlx_backend/profiling.py +239 -0
- nexaai/mlx_backend/rerank/__init__.py +0 -0
- nexaai/mlx_backend/rerank/generate.py +174 -0
- nexaai/mlx_backend/rerank/interface.py +287 -0
- nexaai/mlx_backend/rerank/main.py +127 -0
- nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
- nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
- nexaai/mlx_backend/sd/__init__.py +1 -0
- nexaai/mlx_backend/sd/interface.py +362 -0
- nexaai/mlx_backend/sd/main.py +286 -0
- nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
- nexaai/mlx_backend/sd/modeling/clip.py +116 -0
- nexaai/mlx_backend/sd/modeling/config.py +65 -0
- nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
- nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
- nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
- nexaai/mlx_backend/sd/modeling/unet.py +460 -0
- nexaai/mlx_backend/sd/modeling/vae.py +274 -0
- nexaai/mlx_backend/tts/__init__.py +12 -0
- nexaai/mlx_backend/tts/interface.py +276 -0
- nexaai/mlx_backend/vlm/__init__.py +3 -0
- nexaai/mlx_backend/vlm/generate.py +572 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +559 -0
- nexaai/mlx_backend/vlm/main.py +365 -0
- nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
- nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
- nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
- nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
- nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
- nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
- nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
- nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
- nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
- nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
- nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
- nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
- nexaai/rerank.py +57 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
- nexaai/runtime.py +68 -0
- nexaai/runtime_error.py +24 -0
- nexaai/tts.py +75 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +94 -0
- nexaai/tts_impl/pybind_tts_impl.py +43 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/manifest_utils.py +531 -0
- nexaai/utils/model_manager.py +1745 -0
- nexaai/utils/model_types.py +49 -0
- nexaai/utils/progress_tracker.py +389 -0
- nexaai/utils/quantization_utils.py +245 -0
- nexaai/vlm.py +130 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
- nexaai-1.0.29.dist-info/METADATA +35 -0
- nexaai-1.0.29.dist-info/RECORD +580 -0
- nexaai-1.0.29.dist-info/WHEEL +5 -0
- nexaai-1.0.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,122 @@
+from typing import Any, List, Optional, Sequence
+import argparse
+import sys
+import os
+
+import mlx.core as mx
+import numpy as np
+
+from ml import ASR, ASRConfig, ASRResult, Path as MLPath
+from mlx_audio.stt.utils import load_model
+from mlx_audio.stt.models.whisper.tokenizer import LANGUAGES
+from mlx_audio.stt.models.whisper.whisper import Model
+import soundfile as sf
+import scipy.signal
+
+from profiling import ProfilingMixin, StopReason
+
+
+class MlxAsr(ASR, ProfilingMixin):
+    """MLX Audio implementation of ASR interface."""
+
+    def __init__(
+        self,
+        model_path: MLPath,
+        tokenizer_path: Optional[MLPath],
+        language: Optional[str],
+        device: Optional[str] = None,
+    ) -> None:
+        # Initialize profiling mixin
+        ProfilingMixin.__init__(self)
+
+        if os.path.isfile(model_path):
+            model_path = os.path.dirname(model_path)
+
+        super().__init__(model_path, tokenizer_path, language, device)
+
+        # Load model immediately in constructor
+        self.model: Model = load_model(model_path)
+        self.model_path = model_path
+
+    def destroy(self) -> None:
+        """Destroy the model and free resources."""
+        if self.model is not None:
+            del self.model
+            self.model = None
+            mx.clear_cache()
+
+    def close(self) -> None:
+        """Close the model."""
+        self.destroy()
+
+    def transcribe(
+        self,
+        audio_path: MLPath,
+        language: Optional[str] = None,
+        config: Optional[ASRConfig] = None,
+        clear_cache: bool = True,
+    ) -> ASRResult:
+        """Transcribe audio file to text."""
+        if self.model is None:
+            raise RuntimeError("Model not loaded")
+
+        # Start profiling
+        self._start_profiling()
+        self._decode_start()
+
+        try:
+            result = self.model.generate(audio_path)
+
+            if clear_cache:
+                mx.clear_cache()
+
+            self._decode_end()
+            self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
+            self._end_profiling()
+        except Exception as e:
+            self._end_profiling()
+            raise RuntimeError(f"Failed to transcribe audio file {audio_path}: {e}")
+
+        # Extract confidence scores and timestamps
+        confidence_scores = []
+        timestamps = []
+
+        # Handle different result types: Whisper (STTOutput) vs Parakeet (AlignedResult)
+        if hasattr(result, 'segments') and result.segments:
+            # Whisper STTOutput format
+            for segment in result.segments:
+                if 'avg_logprob' in segment:
+                    # Convert log probability to confidence score (0-1)
+                    confidence = max(0.0, min(1.0, np.exp(segment['avg_logprob'])))
+                    confidence_scores.append(confidence)
+                else:
+                    confidence_scores.append(0.5)  # Default confidence
+
+                start_time = segment.get('start', 0.0)
+                end_time = segment.get('end', 0.0)
+                timestamps.append((start_time, end_time))
+        elif hasattr(result, 'sentences') and result.sentences:
+            # Parakeet AlignedResult format
+            for sentence in result.sentences:
+                confidence_scores.append(0.5)  # Default confidence for Parakeet
+                timestamps.append((sentence.start, sentence.end))
+        else:
+            # Single segment case or empty result
+            confidence_scores.append(0.5)
+            timestamps.append((0.0, 0.0))  # Default timestamps
+
+        return ASRResult(
+            transcript=result.text,
+            confidence_scores=confidence_scores,
+            timestamps=timestamps,
+            duration_us=self._get_audio_duration_us(audio_path)
+        )
+
+    def list_supported_languages(self) -> List[str]:
+        """List supported languages."""
+        return list(LANGUAGES.keys())
+
+    def _get_audio_duration_us(self, audio_path: MLPath) -> int:
+        with sf.SoundFile(audio_path) as f:
+            duration_us = f.frames / f.samplerate * 1e6
+            return int(duration_us)
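Judging by its content and the +122 line count, the hunk above is `nexaai/mlx_backend/asr/interface.py` from the listing. A minimal sketch of how the class would be driven, assuming the module resolves its `ml` and `profiling` imports as packaged; both paths below are placeholders:

    # Minimal sketch, assuming the module above is importable as
    # nexaai.mlx_backend.asr.interface and that converted MLX weights
    # live in a local directory (both paths are placeholders).
    from nexaai.mlx_backend.asr.interface import MlxAsr

    asr = MlxAsr(
        model_path="/path/to/whisper-mlx",  # directory of converted weights
        tokenizer_path=None,
        language="en",
    )
    try:
        result = asr.transcribe("sample.wav")
        print(result.transcript)
        # One (start, end) pair per Whisper segment or Parakeet sentence:
        for (start, end), conf in zip(result.timestamps, result.confidence_scores):
            print(f"{start:.2f}-{end:.2f}s  conf={conf:.2f}")
    finally:
        asr.close()  # deletes the model and calls mx.clear_cache()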
@@ -0,0 +1,25 @@
+import atexit
+
+# Store the original atexit.register function
+_original_atexit_register = atexit.register
+
+def _filtered_atexit_register(func, *args, **kwargs):
+    """
+    Clean atexit interceptor that skips nanobind handlers to prevent segfaults due to MLX atexit cleanups.
+    This should be registered early during Python runtime initialization.
+    """
+    # Skip nanobind handlers silently
+    func_type_str = str(type(func))
+    if 'nanobind' in func_type_str or func_type_str.startswith("<class 'nb_"):
+        return lambda: None
+
+    # Allow all other handlers to register normally
+    return _original_atexit_register(func, *args, **kwargs)
+
+def install_atexit_filter():
+    """Install the atexit filter to prevent problematic nanobind registrations."""
+    atexit.register = _filtered_atexit_register
+
+def uninstall_atexit_filter():
+    """Restore the original atexit.register function."""
+    atexit.register = _original_atexit_register
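This hunk matches `nexaai/mlx_backend/common/utils.py` (+25). It monkey-patches `atexit.register` with a wrapper that silently drops nanobind-typed callables, whose teardown can segfault once MLX's own atexit cleanup has run, and forwards everything else unchanged. A sketch of the intended call pattern, assuming the module is importable under the path shown in the listing:

    # Sketch: install the filter before importing anything that queues
    # nanobind atexit handlers; import path assumed from the file listing.
    from nexaai.mlx_backend.common.utils import (
        install_atexit_filter,
        uninstall_atexit_filter,
    )

    install_atexit_filter()    # atexit.register now drops nanobind callables
    import mlx.core as mx      # its nanobind teardown hooks are never queued
    # ... run MLX workload ...
    uninstall_atexit_filter()  # optional: restore the stock atexit.register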
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import time
+import math
+from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+
+from .modeling.pp_ocr_v4 import Config, TextSystem
+
+
+def is_image_file(file_path):
+    """Check if file is an image based on extension."""
+    img_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".rgb"}
+    return Path(file_path).suffix.lower() in img_extensions
+
+
+def get_image_file_list(img_file):
+    """Get list of image files from a directory or single file."""
+    imgs_lists = []
+    if img_file is None or not os.path.exists(img_file):
+        raise Exception("not found any img file in {}".format(img_file))
+
+    if os.path.isfile(img_file) and is_image_file(img_file):
+        imgs_lists.append(img_file)
+    elif os.path.isdir(img_file):
+        for single_file in os.listdir(img_file):
+            file_path = os.path.join(img_file, single_file)
+            if is_image_file(file_path):
+                imgs_lists.append(file_path)
+    if len(imgs_lists) == 0:
+        raise Exception("not found any img file in {}".format(img_file))
+    return imgs_lists
+
+
+def check_and_read_gif(img_path):
+    """Check if image is gif and read it properly."""
+    if os.path.basename(img_path)[-3:] in ["gif", "GIF"]:
+        gif = cv2.VideoCapture(img_path)
+        ret, frame = gif.read()
+        if not ret:
+            print("Cannot read {}. This gif image maybe corrupted.".format(img_path))
+            return None, False
+        if len(frame.shape) == 2 or frame.shape[-1] == 1:
+            frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
+        imgvalue = frame[:, :, ::-1]
+        return imgvalue, True
+    return None, False
+
+
+def draw_ocr_box_txt(
+    image, boxes, txts, scores=None, drop_score=0.5, font_path="./doc/simfang.ttf"
+):
+    """Draw OCR results with boxes and text."""
+    h, w = image.height, image.width
+    img_left = image.copy()
+    img_right = Image.new("RGB", (w, h), (255, 255, 255))
+
+    import random
+    random.seed(0)
+
+    draw_left = ImageDraw.Draw(img_left)
+    draw_right = ImageDraw.Draw(img_right)
+
+    for idx, (box, txt) in enumerate(zip(boxes, txts)):
+        if scores is not None and scores[idx] < drop_score:
+            continue
+
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+        draw_left.polygon(box, fill=color)
+        draw_right.polygon(
+            [
+                box[0][0],
+                box[0][1],
+                box[1][0],
+                box[1][1],
+                box[2][0],
+                box[2][1],
+                box[3][0],
+                box[3][1],
+            ],
+            outline=color,
+        )
+
+        box_height = math.sqrt((box[0][0] - box[3][0]) ** 2 + (box[0][1] - box[3][1]) ** 2)
+        box_width = math.sqrt((box[0][0] - box[1][0]) ** 2 + (box[0][1] - box[1][1]) ** 2)
+
+        if box_height > 2 * box_width:
+            font_size = max(int(box_width * 0.9), 10)
+            try:
+                font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
+            except:
+                font = ImageFont.load_default()
+            cur_y = box[0][1]
+            for c in txt:
+                try:
+                    bbox = font.getbbox(c)
+                    char_size = (bbox[2] - bbox[0], bbox[3] - bbox[1])
+                except:
+                    char_size = (font_size, font_size)
+                draw_right.text((box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font)
+                cur_y += char_size[1]
+        else:
+            font_size = max(int(box_height * 0.8), 10)
+            try:
+                font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
+            except:
+                font = ImageFont.load_default()
+            draw_right.text([box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font)
+
+    img_left = Image.blend(image, img_left, 0.5)
+    img_show = Image.new("RGB", (w * 2, h), (255, 255, 255))
+    img_show.paste(img_left, (0, 0, w, h))
+    img_show.paste(img_right, (w, 0, w * 2, h))
+
+    return np.array(img_show)
+
+
+def load_model():
+    """Load OCR model and return config and text system."""
+    config = Config()
+    ocr_system = TextSystem(config)
+    return config, ocr_system
+
+
+def process_folder(config, ocr_system):
+    """Process all images in the configured folder."""
+    img_paths = get_image_file_list(config.image_dir)
+    if not img_paths:
+        print("[ERR] No images found in", config.image_dir)
+        return
+
+    out_root = Path(config.base_dir) / "output"
+    txt_dir = out_root / "inference_txt"
+    vis_dir = out_root / "inference_results"
+    txt_dir.mkdir(parents=True, exist_ok=True)
+    vis_dir.mkdir(parents=True, exist_ok=True)
+
+    font = config.vis_font_path
+
+    total = 0.0
+    for idx, p in enumerate(img_paths, 1):
+        img, is_gif = check_and_read_gif(p)
+        if not is_gif:
+            img = cv2.imread(p)
+        if img is None:
+            print(f"[WARN] skip {p}")
+            continue
+
+        t0 = time.time()
+        boxes, recs = ocr_system(img)
+        dt = time.time() - t0
+        total += dt
+
+        name = Path(p).stem
+
+        with open(txt_dir / f"{name}.txt", "w", encoding="utf-8") as f:
+            f.writelines(f"{txt}\n" for txt, sc in recs)  # DO NOT write confidence score in txt file
+
+        vis = draw_ocr_box_txt(
+            Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)),
+            boxes,
+            [t for t, _ in recs],
+            [s for _, s in recs],
+            drop_score=config.drop_score,
+            font_path=font,
+        )
+        cv2.imwrite(str(vis_dir / f"{name}.jpg"), vis[:, :, ::-1])
+
+        print(f"[{idx}/{len(img_paths)}] {Path(p).name} boxes={len(boxes)} time={dt:.3f}s")
+
+    print(f"\nDone {len(img_paths)} images in {total:.2f}s (avg {total/len(img_paths):.3f}s)")
+
+
+def main():
+    """Main function to demonstrate OCR functionality."""
+    print("📥 Loading OCR model...")
+
+    # Load model and config
+    config, ocr_system = load_model()
+
+    print("✅ OCR model loaded successfully!")
+    print(f"📂 Processing images from: {config.image_dir}")
+    print("="*50)
+
+    # Process images
+    process_folder(config, ocr_system)
+
+
+if __name__ == "__main__":
+    main()
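This hunk matches `nexaai/mlx_backend/cv/generate.py` (+195): load once via `load_model()`, then loop over a folder, where each image goes through `ocr_system(img)` to produce quadrilateral boxes plus `(text, score)` pairs that are written out as text files and side-by-side visualizations. A compressed single-image version of that loop, with a placeholder input path:

    # Single-image sketch of the pipeline above; assumes the same relative
    # import context so Config/TextSystem resolve from modeling.pp_ocr_v4.
    import cv2

    config, ocr_system = load_model()     # Config() + TextSystem(config)
    img = cv2.imread("docs/invoice.png")  # placeholder path, BGR ndarray
    boxes, recs = ocr_system(img)         # boxes: quads; recs: (text, score)
    for text, score in recs:
        if score >= config.drop_score:    # same threshold the visualizer uses
            print(f"{score:.2f}  {text}")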
@@ -0,0 +1,162 @@
+# Copyright © Nexa AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import os
+import json
+import time
+import cv2
+import numpy as np
+from pathlib import Path
+from typing import Any, List, Optional, Sequence, Tuple, Union
+from PIL import Image
+from dataclasses import dataclass
+
+# Import necessary modules
+import mlx.core as mx
+
+# Import from ml.py for API alignment
+from ml import (
+    CVModel as BaseCVModel,
+    CVModelConfig,
+    CVResults,
+    CVResult,
+    CVCapabilities,
+    Path as PathType,
+)
+
+# Import the model implementation
+from .modeling.pp_ocr_v4 import Config, TextSystem
+
+@dataclass
+class CVConfig:
+    """Configuration for CV processing."""
+    batch_size: int = 1
+    drop_score: float = 0.5
+    font_path: Optional[str] = None
+
+    def __init__(
+        self,
+        batch_size: int = 1,
+        drop_score: float = 0.5,
+        font_path: Optional[str] = None,
+    ) -> None:
+        self.batch_size = batch_size
+        self.drop_score = drop_score
+        self.font_path = font_path
+
+
+class CVModel(BaseCVModel):
+    """
+    CV Model interface for MLX OCR models.
+    API aligned with ml.py CVModel abstract base class.
+    """
+    def __init__(
+        self,
+        config: CVModelConfig,
+        device: Optional[str] = None,
+    ) -> None:
+        super().__init__(config, device)
+        # print(f"config: {config}")
+        # TODO: this hack is to support local model path
+        # hack only support pp_ocr_v4
+
+        det_path_str = str(config.det_model_path) if config.det_model_path else None
+        rec_path_str = str(config.rec_model_path) if config.rec_model_path else None
+
+        # Determine model_cache_dir (prefer det_model_path, fallback to rec_model_path)
+        path_to_check = det_path_str or rec_path_str
+
+        if path_to_check:
+            if os.path.isdir(path_to_check):
+                model_cache_dir = path_to_check
+            else:
+                model_cache_dir = os.path.dirname(path_to_check)
+        else:
+            model_cache_dir = None
+
+        cfg = Config(model_cache_dir)
+        cfg.device = self.device
+        self.ocr_system = TextSystem(cfg)
+
+    def destroy(self) -> None:
+        """Destroy the model and free resources."""
+        self.ocr_system = None
+        self.config = None
+
+    def close(self) -> None:
+        """Close the model."""
+        self.destroy()
+
+    def infer(self, input_image_path: str, clear_cache: bool = True) -> CVResults:
+        """Perform inference on image."""
+        if self.ocr_system is None:
+            raise RuntimeError("Model not loaded. Call load_model() first.")
+
+        # Load image
+        img = self._load_image(input_image_path)
+        if img is None:
+            raise ValueError(f"Failed to load image: {input_image_path}")
+
+        # Process with OCR
+        boxes, recs = self.ocr_system(img)
+
+        if clear_cache:
+            mx.clear_cache()
+
+        # Convert to CVResults format
+        results = []
+        for box, (text, score) in zip(boxes, recs):
+            # Create CVResult
+            result = CVResult(
+                text=text,
+                confidence=score,
+                # Note: OCR doesn't use bounding boxes in the same way as detection models
+                # but we can store the box coordinates if needed
+            )
+            results.append(result)
+
+        return CVResults(results=results, result_count=len(results))
+
+
+    def _load_image(self, image_path: Union[str, PathType]) -> Optional[np.ndarray]:
+        """Load image from path."""
+        try:
+            # Check if it's a GIF
+            if str(image_path).lower().endswith('.gif'):
+                gif = cv2.VideoCapture(str(image_path))
+                ret, frame = gif.read()
+                if not ret:
+                    return None
+                if len(frame.shape) == 2 or frame.shape[-1] == 1:
+                    frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
+                return frame[:, :, ::-1]  # BGR to RGB
+            else:
+                img = cv2.imread(str(image_path))
+                if img is None:
+                    return None
+                return img
+        except Exception as e:
+            print(f"Error loading image {image_path}: {e}")
+            return None
+
+
+
+def create_cv_model(
+    config: CVModelConfig,
+    device: Optional[str] = None,
+) -> CVModel:
+    """Create a CV model instance."""
+    return CVModel(config, device)
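This hunk matches `nexaai/mlx_backend/cv/interface.py` (+162). Note the lifecycle: the constructor collapses `det_model_path`/`rec_model_path` file paths to a parent directory, builds the `TextSystem` eagerly, and `close()`/`destroy()` simply drop the reference. A sketch of the factory in use; the `CVModelConfig` keywords mirror the test driver in the next hunk, and the image path is made up:

    # Sketch of create_cv_model()/infer()/close(); CVModelConfig fields are
    # taken from the cv/main.py test driver below, the image path is invented.
    config = CVModelConfig(
        capabilities=0,                # ML_CV_OCR
        model_path="nexaml/paddle-ocr-mlx",
        system_library_path=None,
        backend_library_path=None,
        extension_library_path=None,
        config_file_path=None,
        char_dict_path=None,
    )
    model = create_cv_model(config)
    out = model.infer("scan.jpeg")     # runs TextSystem, then mx.clear_cache()
    for r in out.results:              # one CVResult per detected text region
        print(r.text, r.confidence)
    model.close()                      # drops the TextSystem reference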
@@ -0,0 +1,81 @@
+# Copyright © Nexa AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from .interface import create_cv_model, CVModelConfig
+
+
+def test_cv_model(model_path, test_image_path):
+    """Test CV model functionality."""
+
+    # Create CVModelConfig
+    config = CVModelConfig(
+        capabilities=0,  # ML_CV_OCR
+        model_path=model_path,
+        system_library_path=None,
+        backend_library_path=None,
+        extension_library_path=None,
+        config_file_path=None,
+        char_dict_path=None
+    )
+
+    model = create_cv_model(config)
+    print("✅ Model loaded successfully!")
+
+    # Test images (you can replace these with actual image paths)
+    test_images = [
+        "cv/modeling/input/20250406-170821.jpeg",
+        "cv/modeling/input/20250406-170838.jpeg",
+        "cv/modeling/input/20250406-170906.jpeg",
+        "cv/modeling/input/20250407-154044.jpeg",
+        "cv/modeling/input/20250407-154059.jpeg"
+    ] if test_image_path is None else [test_image_path]
+
+    for img_path in test_images:
+        if not os.path.exists(img_path):
+            print(f"❌ Image file not found: {img_path}")
+            continue
+
+        results = model.infer(img_path)
+        print(f"✅ OCR Results for {img_path}:")
+        print("=" * 50)
+
+        if results.result_count == 0:
+            print("No text detected in the image.")
+        else:
+            print(f"Found {results.result_count} text regions:")
+
+            for i, result in enumerate(results.results):
+                print(f"\nRegion {i+1}:")
+                print(f"  Text: '{result.text}'")
+                print(f"  Confidence: {result.confidence:.3f}")
+
+    print("\n✅ CV model test completed!")
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Test CV processor functionality")
+    parser.add_argument("--model_path", type=str, default="nexaml/paddle-ocr-mlx",
+                        help="Path to the CV model")
+    parser.add_argument("--image_path", type=str, default=None,
+                        help="Path to a specific image to process")
+    parser.add_argument("--test_mode", action="store_true",
+                        help="Run in test mode with sample images")
+
+    args = parser.parse_args()
+
+    test_cv_model(args.model_path, args.image_path)
+
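This final hunk matches `nexaai/mlx_backend/cv/main.py` (+81) and is itself the package's test driver. It can also be called directly, bypassing argparse; the model identifier below is the argparse default and the image path is a placeholder:

    # Hypothetical direct call, bypassing the argparse entry point.
    test_cv_model(model_path="nexaml/paddle-ocr-mlx",
                  test_image_path="scan.jpeg")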