xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +11 -28
- xinference/client/restful/async_restful_client.py +20 -3
- xinference/client/restful/restful_client.py +20 -3
- xinference/core/supervisor.py +87 -53
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +21 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/model_spec.json +38 -1
- xinference/model/image/model_spec.json +69 -0
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +4 -0
- xinference/model/llm/llm_family.json +464 -2
- xinference/model/llm/sglang/core.py +30 -11
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/utils.py +12 -9
- xinference/model/llm/vllm/core.py +93 -17
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.d192c4f3.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.d192c4f3.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/METADATA +18 -2
- {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/RECORD +285 -67
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/WHEEL +0 -0
- {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Tuple, Union, Optional
+
+import torch
+import yaml
+from torch import nn
+from .heads import ISTFTHead
+from .models import VocosBackbone
+
+
+class Vocos(nn.Module):
+    """
+    The Vocos class represents a Fourier-based neural vocoder for audio synthesis.
+    This class is primarily designed for inference, with support for loading from pretrained
+    model checkpoints. It consists of three main components: a feature extractor,
+    a backbone, and a head.
+    """
+
+    def __init__(
+        self, args,
+    ):
+        super().__init__()
+        self.backbone = VocosBackbone(
+            input_channels=args.vocos.backbone.input_channels,
+            dim=args.vocos.backbone.dim,
+            intermediate_dim=args.vocos.backbone.intermediate_dim,
+            num_layers=args.vocos.backbone.num_layers,
+        )
+        self.head = ISTFTHead(
+            dim=args.vocos.head.dim,
+            n_fft=args.vocos.head.n_fft,
+            hop_length=args.vocos.head.hop_length,
+            padding=args.vocos.head.padding,
+        )
+
+    def forward(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+        """
+        Method to decode audio waveform from already calculated features. The features input is passed through
+        the backbone and the head to reconstruct the audio output.
+
+        Args:
+            features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
+                C denotes the feature dimension, and L is the sequence length.
+
+        Returns:
+            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+        """
+        x = self.backbone(features_input, **kwargs)
+        audio_output = self.head(x)
+        return audio_output
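Assuming this hunk is the new `vocos/pretrained.py` listed above (the +51 line count matches), here is a minimal smoke-test sketch of how the decoder could be exercised. The `SimpleNamespace` stand-in for the YAML-derived `args` object and every hyperparameter value below are illustrative assumptions, not values shipped in the wheel.

```python
# Hypothetical usage sketch for the vendored Vocos decoder; all config values
# here are assumptions chosen only so the shapes line up.
from types import SimpleNamespace

import torch

from xinference.thirdparty.indextts.s2mel.modules.vocos.pretrained import Vocos

ns = SimpleNamespace
args = ns(vocos=ns(
    backbone=ns(input_channels=128, dim=512, intermediate_dim=1536, num_layers=8),
    head=ns(dim=512, n_fft=1024, hop_length=256, padding="same"),
))

vocoder = Vocos(args).eval()
features = torch.randn(1, 128, 200)   # (B, C, L) acoustic features
with torch.no_grad():
    audio = vocoder(features)         # (B, T) waveform: backbone -> ISTFT head
print(audio.shape)
```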
--- /dev/null
+++ xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py
@@ -0,0 +1,192 @@
+import numpy as np
+import scipy
+import torch
+from torch import nn, view_as_real, view_as_complex
+
+
+class ISTFT(nn.Module):
+    """
+    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
+    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
+    See issue: https://github.com/pytorch/pytorch/issues/62323
+    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
+    The NOLA constraint is met as we trim padded samples anyway.
+
+    Args:
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames.
+        win_length (int): The size of window frame and STFT filter.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        window = torch.hann_window(win_length)
+        self.register_buffer("window", window)
+
+    def forward(self, spec: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
+
+        Args:
+            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
+                N is the number of frequency bins, and T is the number of time frames.
+
+        Returns:
+            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
+        """
+        if self.padding == "center":
+            # Fallback to pytorch native implementation
+            return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
+        elif self.padding == "same":
+            pad = (self.win_length - self.hop_length) // 2
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        assert spec.dim() == 3, "Expected a 3D tensor as input"
+        B, N, T = spec.shape
+
+        # Inverse FFT
+        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        ifft = ifft * self.window[None, :, None]
+
+        # Overlap and Add
+        output_size = (T - 1) * self.hop_length + self.win_length
+        y = torch.nn.functional.fold(
+            ifft, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
+        )[:, 0, 0, pad:-pad]
+
+        # Window envelope
+        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
+        window_envelope = torch.nn.functional.fold(
+            window_sq, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
+        ).squeeze()[pad:-pad]
+
+        # Normalize
+        assert (window_envelope > 1e-11).all()
+        y = y / window_envelope
+
+        return y
+
+
+class MDCT(nn.Module):
+    """
+    Modified Discrete Cosine Transform (MDCT) module.
+
+    Args:
+        frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, frame_len: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.frame_len = frame_len
+        N = frame_len // 2
+        n0 = (N + 1) / 2
+        window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
+        self.register_buffer("window", window)
+
+        pre_twiddle = torch.exp(-1j * torch.pi * torch.arange(frame_len) / frame_len)
+        post_twiddle = torch.exp(-1j * torch.pi * n0 * (torch.arange(N) + 0.5) / N)
+        # view_as_real: NCCL Backend does not support ComplexFloat data type
+        # https://github.com/pytorch/pytorch/issues/71613
+        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
+        self.register_buffer("post_twiddle", view_as_real(post_twiddle))
+
+    def forward(self, audio: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the Modified Discrete Cosine Transform (MDCT) to the input audio.
+
+        Args:
+            audio (Tensor): Input audio waveform of shape (B, T), where B is the batch size
+                and T is the length of the audio.
+
+        Returns:
+            Tensor: MDCT coefficients of shape (B, L, N), where L is the number of output frames
+                and N is the number of frequency bins.
+        """
+        if self.padding == "center":
+            audio = torch.nn.functional.pad(audio, (self.frame_len // 2, self.frame_len // 2))
+        elif self.padding == "same":
+            # hop_length is 1/2 frame_len
+            audio = torch.nn.functional.pad(audio, (self.frame_len // 4, self.frame_len // 4))
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        x = audio.unfold(-1, self.frame_len, self.frame_len // 2)
+        N = self.frame_len // 2
+        x = x * self.window.expand(x.shape)
+        X = torch.fft.fft(x * view_as_complex(self.pre_twiddle).expand(x.shape), dim=-1)[..., :N]
+        res = X * view_as_complex(self.post_twiddle).expand(X.shape) * np.sqrt(1 / N)
+        return torch.real(res) * np.sqrt(2)
+
+
+class IMDCT(nn.Module):
+    """
+    Inverse Modified Discrete Cosine Transform (IMDCT) module.
+
+    Args:
+        frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, frame_len: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.frame_len = frame_len
+        N = frame_len // 2
+        n0 = (N + 1) / 2
+        window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
+        self.register_buffer("window", window)
+
+        pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N)
+        post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2))
+        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
+        self.register_buffer("post_twiddle", view_as_real(post_twiddle))
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients.
+
+        Args:
+            X (Tensor): Input MDCT coefficients of shape (B, L, N), where B is the batch size,
+                L is the number of frames, and N is the number of frequency bins.
+
+        Returns:
+            Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio.
+        """
+        B, L, N = X.shape
+        Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device)
+        Y[..., :N] = X
+        Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,)))
+        y = torch.fft.ifft(Y * view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1)
+        y = torch.real(y * view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2)
+        result = y * self.window.expand(y.shape)
+        output_size = (1, (L + 1) * N)
+        audio = torch.nn.functional.fold(
+            result.transpose(1, 2),
+            output_size=output_size,
+            kernel_size=(1, self.frame_len),
+            stride=(1, self.frame_len // 2),
+        )[:, 0, 0, :]
+
+        if self.padding == "center":
+            pad = self.frame_len // 2
+        elif self.padding == "same":
+            pad = self.frame_len // 4
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        audio = audio[:, pad:-pad]
+        return audio
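Assuming this hunk is the new `vocos/spectral_ops.py` from the file list (the +192 count matches), a small round-trip sketch, not part of the package, shows what the "same"-padding path does: pad the signal by `(win_length - hop_length) / 2` on each side, take a `center=False` STFT, and the custom ISTFT trims back to the original length exactly.

```python
# Round-trip sketch for the "same"-padding ISTFT; the import path is an
# assumption based on the file list above.
import torch
import torch.nn.functional as F

from xinference.thirdparty.indextts.s2mel.modules.vocos.spectral_ops import ISTFT

n_fft = win_length = 1024
hop_length = 256
pad = (win_length - hop_length) // 2

x = torch.randn(1, hop_length * 50)               # (B, T) test signal
spec = torch.stft(
    F.pad(x, (pad, pad)), n_fft, hop_length=hop_length, win_length=win_length,
    window=torch.hann_window(win_length), center=False, return_complex=True,
)                                                  # (B, n_fft//2 + 1, frames)

istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=win_length, padding="same")
y = istft(spec)

assert y.shape == x.shape                          # padding trimmed away
print(torch.allclose(y, x, atol=1e-4))             # envelope division inverts the STFT
```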
--- /dev/null
+++ xinference/thirdparty/indextts/s2mel/modules/wavenet.py
@@ -0,0 +1,174 @@
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from indextts.s2mel.modules.encodec import SConv1d
+
+from . import commons
+LRELU_SLOPE = 0.1
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(
+            nn.ReLU(),
+            nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+
+
+class DDSConv(nn.Module):
+    """
+    Dialted and Depth-Separable Convolution
+    """
+
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for i in range(n_layers):
+            dilation = kernel_size ** i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+                                            groups=channels, dilation=dilation, padding=padding
+                                            ))
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+
+    def forward(self, x, x_mask, g=None):
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            y = self.convs_sep[i](x * x_mask)
+            y = self.norms_1[i](y)
+            y = F.gelu(y)
+            y = self.convs_1x1[i](y)
+            y = self.norms_2[i](y)
+            y = F.gelu(y)
+            y = self.drop(y)
+            x = x + y
+        return x * x_mask
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0, causal=False):
+        super(WN, self).__init__()
+        conv1d_type = SConv1d
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size,
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        if gin_channels != 0:
+            self.cond_layer = conv1d_type(gin_channels, 2 * hidden_channels * n_layers, 1, norm='weight_norm')
+
+        for i in range(n_layers):
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = conv1d_type(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation,
+                                   padding=padding, norm='weight_norm', causal=causal)
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = conv1d_type(hidden_channels, res_skip_channels, 1, norm='weight_norm', causal=causal)
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            acts = commons.fused_add_tanh_sigmoid_multiply(
+                x_in,
+                g_l,
+                n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
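The WN block above leans on `commons.fused_add_tanh_sigmoid_multiply`, defined elsewhere in the vendored package. For readers tracing the data flow, a reference sketch of the standard WaveNet gated activation that this helper conventionally implements follows; this is an assumption about the vendored `commons`, which may differ in fusion details but not semantics.

```python
# Reference sketch (an assumption, not the vendored code) of the standard
# WaveNet gated activation used inside WN.forward.
import torch

def fused_add_tanh_sigmoid_multiply(x, g, n_channels):
    n = n_channels[0]
    in_act = x + g                            # add conditioning to conv output
    t_act = torch.tanh(in_act[:, :n, :])      # "filter" half of the channels
    s_act = torch.sigmoid(in_act[:, n:, :])   # "gate" half of the channels
    return t_act * s_act

x_in = torch.randn(2, 2 * 192, 100)   # 2 * hidden_channels from the dilated conv
g_l = torch.zeros_like(x_in)          # unconditional case (gin_channels == 0)
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, torch.IntTensor([192]))
print(acts.shape)                     # (2, 192, 100)
```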
--- /dev/null
+++ xinference/thirdparty/indextts/s2mel/optimizers.py
@@ -0,0 +1,96 @@
+#coding:utf-8
+import os, sys
+import os.path as osp
+import numpy as np
+import torch
+from torch import nn
+from torch.optim import Optimizer
+from functools import reduce
+from torch.optim import AdamW
+
+class MultiOptimizer:
+    def __init__(self, optimizers={}, schedulers={}):
+        self.optimizers = optimizers
+        self.schedulers = schedulers
+        self.keys = list(optimizers.keys())
+        self.param_groups = reduce(lambda x,y: x+y, [v.param_groups for v in self.optimizers.values()])
+
+    def state_dict(self):
+        state_dicts = [(key, self.optimizers[key].state_dict())\
+                       for key in self.keys]
+        return state_dicts
+
+    def scheduler_state_dict(self):
+        state_dicts = [(key, self.schedulers[key].state_dict())\
+                       for key in self.keys]
+        return state_dicts
+
+    def load_state_dict(self, state_dict):
+        for key, val in state_dict:
+            try:
+                self.optimizers[key].load_state_dict(val)
+            except:
+                print("Unloaded %s" % key)
+
+    def load_scheduler_state_dict(self, state_dict):
+        for key, val in state_dict:
+            try:
+                self.schedulers[key].load_state_dict(val)
+            except:
+                print("Unloaded %s" % key)
+
+    def step(self, key=None, scaler=None):
+        keys = [key] if key is not None else self.keys
+        _ = [self._step(key, scaler) for key in keys]
+
+    def _step(self, key, scaler=None):
+        if scaler is not None:
+            scaler.step(self.optimizers[key])
+            scaler.update()
+        else:
+            self.optimizers[key].step()
+
+    def zero_grad(self, key=None):
+        if key is not None:
+            self.optimizers[key].zero_grad()
+        else:
+            _ = [self.optimizers[key].zero_grad() for key in self.keys]
+
+    def scheduler(self, *args, key=None):
+        if key is not None:
+            self.schedulers[key].step(*args)
+        else:
+            _ = [self.schedulers[key].step_batch(*args) for key in self.keys]
+
+def define_scheduler(optimizer, params):
+    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=params['gamma'])
+
+    return scheduler
+
+def build_optimizer(model_dict, lr, type='AdamW'):
+    optim = {}
+    for key, model in model_dict.items():
+        model_parameters = model.parameters()
+        parameters_names = []
+        parameters_names.append(
+            [
+                name_param_pair[0]
+                for name_param_pair in model.named_parameters()
+            ]
+        )
+        if type == 'AdamW':
+            optim[key] = AdamW(
+                model_parameters,
+                lr=lr,
+                betas=(0.9, 0.98),
+                eps=1e-9,
+                weight_decay=0.1,
+            )
+        else:
+            raise ValueError('Unknown optimizer type: %s' % type)
+
+    schedulers = dict([(key, torch.optim.lr_scheduler.ExponentialLR(opt, gamma=0.999996))
+                       for key, opt in optim.items()])
+
+    multi_optim = MultiOptimizer(optim, schedulers)
+    return multi_optim
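A minimal sketch (with made-up module names, and an import path assumed from the file list above) of how `build_optimizer` and the resulting `MultiOptimizer` could be driven in a training step. Note that the keyless `scheduler()` branch calls `step_batch`, which `ExponentialLR` does not define, so stepping schedulers per key is the safe path.

```python
# Hypothetical training-step sketch around MultiOptimizer; the model dict is a
# stand-in, not anything shipped in the wheel.
import torch
from torch import nn

from xinference.thirdparty.indextts.s2mel.optimizers import build_optimizer

model_dict = {"encoder": nn.Linear(16, 16), "decoder": nn.Linear(16, 16)}
optim = build_optimizer(model_dict, lr=1e-4)   # AdamW + ExponentialLR per sub-model

x = torch.randn(8, 16)
loss = model_dict["decoder"](model_dict["encoder"](x)).pow(2).mean()

optim.zero_grad()                 # zeroes every sub-optimizer
loss.backward()
optim.step()                      # steps every sub-optimizer (optim.step("encoder") for one)
for key in optim.keys:
    optim.scheduler(key=key)      # per-key stepping; ExponentialLR lacks step_batch()
```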
--- /dev/null
+++ xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py
@@ -0,0 +1,148 @@
+from transformers import SeamlessM4TFeatureExtractor
+from transformers import Wav2Vec2BertModel
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import librosa
+import os
+import pickle
+import math
+import json
+import safetensors
+import json5
+# from codec.kmeans.repcodec_model import RepCodec
+from startts.examples.ftchar.models.codec.kmeans.repcodec_model import RepCodec
+
+class JsonHParams:
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            if type(v) == dict:
+                v = JsonHParams(**v)
+            self[k] = v
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def items(self):
+        return self.__dict__.items()
+
+    def values(self):
+        return self.__dict__.values()
+
+    def __len__(self):
+        return len(self.__dict__)
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        return setattr(self, key, value)
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __repr__(self):
+        return self.__dict__.__repr__()
+
+
+def _load_config(config_fn, lowercase=False):
+    """Load configurations into a dictionary
+
+    Args:
+        config_fn (str): path to configuration file
+        lowercase (bool, optional): whether changing keys to lower case. Defaults to False.
+
+    Returns:
+        dict: dictionary that stores configurations
+    """
+    with open(config_fn, "r") as f:
+        data = f.read()
+    config_ = json5.loads(data)
+    if "base_config" in config_:
+        # load configurations from new path
+        p_config_path = os.path.join(os.getenv("WORK_DIR"), config_["base_config"])
+        p_config_ = _load_config(p_config_path)
+        config_ = override_config(p_config_, config_)
+    if lowercase:
+        # change keys in config_ to lower case
+        config_ = get_lowercase_keys_config(config_)
+    return config_
+
+
+def load_config(config_fn, lowercase=False):
+    """Load configurations into a dictionary
+
+    Args:
+        config_fn (str): path to configuration file
+        lowercase (bool, optional): _description_. Defaults to False.
+
+    Returns:
+        JsonHParams: an object that stores configurations
+    """
+    config_ = _load_config(config_fn, lowercase=lowercase)
+    # create an JsonHParams object with configuration dict
+    cfg = JsonHParams(**config_)
+    return cfg
+
+class Extract_wav2vectbert:
+    def __init__(self,device):
+        #semantic_model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")
+        self.semantic_model = Wav2Vec2BertModel.from_pretrained("./MaskGCT_model/w2v_bert/")
+        self.semantic_model.eval()
+        self.semantic_model.to(device)
+        self.stat_mean_var = torch.load("./MaskGCT_model/wav2vec2bert_stats.pt")
+        self.semantic_mean = self.stat_mean_var["mean"]
+        self.semantic_std = torch.sqrt(self.stat_mean_var["var"])
+        self.semantic_mean = self.semantic_mean.to(device)
+        self.semantic_std = self.semantic_std.to(device)
+        self.processor = SeamlessM4TFeatureExtractor.from_pretrained(
+            "./MaskGCT_model/w2v_bert/")
+        self.device = device
+
+        cfg_maskgct = load_config('./MaskGCT_model/maskgct.json')
+        cfg = cfg_maskgct.model.semantic_codec
+        self.semantic_code_ckpt = r'./MaskGCT_model/semantic_codec/model.safetensors'
+        self.semantic_codec = RepCodec(cfg=cfg)
+        self.semantic_codec.eval()
+        self.semantic_codec.to(device)
+        safetensors.torch.load_model(self.semantic_codec, self.semantic_code_ckpt)
+
+    @torch.no_grad()
+    def extract_features(self, speech): # speech [b,T]
+        inputs = self.processor(speech, sampling_rate=16000, return_tensors="pt")
+        input_features = inputs["input_features"]
+        attention_mask = inputs["attention_mask"]
+        return input_features, attention_mask #[2, 620, 160] [2, 620]
+
+    @torch.no_grad()
+    def extract_semantic_code(self, input_features, attention_mask):
+        vq_emb = self.semantic_model( # Wav2Vec2BertModel
+            input_features=input_features,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+        )
+        feat = vq_emb.hidden_states[17] # (B, T, C)
+        feat = (feat - self.semantic_mean.to(feat)) / self.semantic_std.to(feat)
+
+        semantic_code, rec_feat = self.semantic_codec.quantize(feat) # (B, T)
+        return semantic_code, rec_feat
+
+    def feature_extract(self, prompt_speech):
+
+        input_features, attention_mask = self.extract_features(prompt_speech)
+        input_features = input_features.to(self.device)
+        attention_mask = attention_mask.to(self.device)
+        semantic_code, rec_feat = self.extract_semantic_code(input_features, attention_mask)
+        return semantic_code,rec_feat
+
+if __name__=='__main__':
+    speech_path = 'test/magi1.wav'
+    speech = librosa.load(speech_path, sr=16000)[0]
+    speech = np.c_[speech,speech,speech].T #[2, 198559]
+    print(speech.shape)
+
+    Extract_feature = Extract_wav2vectbert('cuda:0')
+    semantic_code,rec_feat = Extract_feature.feature_extract(speech)
+    print(semantic_code.shape,rec_feat.shape)
+