xinference-1.10.0-py3-none-any.whl → xinference-1.11.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +473 -31
- xinference/client/restful/async_restful_client.py +178 -8
- xinference/client/restful/restful_client.py +151 -3
- xinference/core/supervisor.py +99 -53
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +21 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/model_spec.json +58 -21
- xinference/model/image/model_spec.json +159 -90
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +6 -2
- xinference/model/llm/llm_family.json +1299 -174
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +44 -11
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +129 -36
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/transformers/utils.py +23 -0
- xinference/model/llm/utils.py +48 -32
- xinference/model/llm/vllm/core.py +207 -72
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py
@@ -0,0 +1,275 @@
+import os
+import torch
+import argparse
+import gradio as gr
+from zipfile import ZipFile
+import langid
+from . import se_extractor
+from .api import BaseSpeakerTTS, ToneColorConverter
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--share", action='store_true', default=False, help="make link public")
+args = parser.parse_args()
+
+en_ckpt_base = 'checkpoints/base_speakers/EN'
+zh_ckpt_base = 'checkpoints/base_speakers/ZH'
+ckpt_converter = 'checkpoints/converter'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
+
+# load models
+en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
+en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
+zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
+zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
+tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
+# load speaker embeddings
+en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
+en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
+zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
+
+# This online demo mainly supports English and Chinese
+supported_languages = ['zh', 'en']
+
+def predict(prompt, style, audio_file_pth, agree):
+    # initialize a empty info
+    text_hint = ''
+    # agree with the terms
+    if agree == False:
+        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
+        gr.Warning("Please accept the Terms & Condition!")
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    # first detect the input language
+    language_predicted = langid.classify(prompt)[0].strip()
+    print(f"Detected language:{language_predicted}")
+
+    if language_predicted not in supported_languages:
+        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
+        gr.Warning(
+            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
+        )
+
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    if language_predicted == "zh":
+        tts_model = zh_base_speaker_tts
+        source_se = zh_source_se
+        language = 'Chinese'
+        if style not in ['default']:
+            text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
+            gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
+            return (
+                text_hint,
+                None,
+                None,
+            )
+
+    else:
+        tts_model = en_base_speaker_tts
+        if style == 'default':
+            source_se = en_source_default_se
+        else:
+            source_se = en_source_style_se
+        language = 'English'
+        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
+            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
+            gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
+            return (
+                text_hint,
+                None,
+                None,
+            )
+
+    speaker_wav = audio_file_pth
+
+    if len(prompt) < 2:
+        text_hint += f"[ERROR] Please give a longer prompt text \n"
+        gr.Warning("Please give a longer prompt text")
+        return (
+            text_hint,
+            None,
+            None,
+        )
+    if len(prompt) > 200:
+        text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
+        gr.Warning(
+            "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
+    try:
+        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
+    except Exception as e:
+        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
+        gr.Warning(
+            "[ERROR] Get target tone color error {str(e)} \n"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    src_path = f'{output_dir}/tmp.wav'
+    tts_model.tts(prompt, src_path, speaker=style, language=language)
+
+    save_path = f'{output_dir}/output.wav'
+    # Run the tone color converter
+    encode_message = "@MyShell"
+    tone_color_converter.convert(
+        audio_src_path=src_path,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=save_path,
+        message=encode_message)
+
+    text_hint += f'''Get response successfully \n'''
+
+    return (
+        text_hint,
+        save_path,
+        speaker_wav,
+    )
+
+
+
+title = "MyShell OpenVoice"
+
+description = """
+We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
+"""
+
+markdown_table = """
+<div align="center" style="margin-bottom: 10px;">
+
+| | | |
+| :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | **Project Page** | **Join the Community** |
+| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [](https://discord.gg/myshell) |
+
+</div>
+"""
+
+markdown_table_v2 = """
+<div align="center" style="margin-bottom: 2px;">
+
+| | | | |
+| :-----------: | :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
+
+| | |
+| :-----------: | :-----------: |
+**Join the Community** | [](https://discord.gg/myshell) |
+
+</div>
+"""
+content = """
+<div>
+<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
+This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
+</div>
+"""
+wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
+
+
+examples = [
+    [
+        "今天天气真好,我们一起出去吃饭吧。",
+        'default',
+        "resources/demo_speaker1.mp3",
+        True,
+    ],[
+        "This audio is generated by open voice with a half-performance model.",
+        'whispering',
+        "resources/demo_speaker2.mp3",
+        True,
+    ],
+    [
+        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+        'sad',
+        "resources/demo_speaker0.mp3",
+        True,
+    ],
+]
+
+with gr.Blocks(analytics_enabled=False) as demo:
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Row():
+                gr.Markdown(
+                    """
+                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
+                    """
+                )
+            with gr.Row():
+                gr.Markdown(markdown_table_v2)
+            with gr.Row():
+                gr.Markdown(description)
+        with gr.Column():
+            gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
+
+    with gr.Row():
+        gr.HTML(wrapped_markdown_content)
+
+    with gr.Row():
+        with gr.Column():
+            input_text_gr = gr.Textbox(
+                label="Text Prompt",
+                info="One or two sentences at a time produces the best results. Up to 200 text characters.",
+                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+            )
+            style_gr = gr.Dropdown(
+                label="Style",
+                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
+                choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
+                max_choices=1,
+                value="default",
+            )
+            ref_gr = gr.Audio(
+                label="Reference Audio",
+                info="Click on the ✎ button to upload your own target speaker audio",
+                type="filepath",
+                value="resources/demo_speaker2.mp3",
+            )
+            tos_gr = gr.Checkbox(
+                label="Agree",
+                value=False,
+                info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
+            )
+
+            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+
+
+        with gr.Column():
+            out_text_gr = gr.Text(label="Info")
+            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+            ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+            gr.Examples(examples,
+                        label="Examples",
+                        inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
+                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
+                        fn=predict,
+                        cache_examples=False,)
+    tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
+
+demo.queue()
+demo.launch(debug=True, show_api=True, share=args.share)
xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py
@@ -0,0 +1,153 @@
+import os
+import glob
+import torch
+import hashlib
+import librosa
+import base64
+from glob import glob
+import numpy as np
+from pydub import AudioSegment
+from faster_whisper import WhisperModel
+import hashlib
+import base64
+import librosa
+# from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
+
+model_size = "medium"
+# Run on GPU with FP16
+model = None
+def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
+    global model
+    if model is None:
+        model = WhisperModel(model_size, device="cuda", compute_type="float16")
+    audio = AudioSegment.from_file(audio_path)
+    max_len = len(audio)
+
+    target_folder = os.path.join(target_dir, audio_name)
+
+    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+    segments = list(segments)
+
+    # create directory
+    os.makedirs(target_folder, exist_ok=True)
+    wavs_folder = os.path.join(target_folder, 'wavs')
+    os.makedirs(wavs_folder, exist_ok=True)
+
+    # segments
+    s_ind = 0
+    start_time = None
+
+    for k, w in enumerate(segments):
+        # process with the time
+        if k == 0:
+            start_time = max(0, w.start)
+
+        end_time = w.end
+
+        # calculate confidence
+        if len(w.words) > 0:
+            confidence = sum([s.probability for s in w.words]) / len(w.words)
+        else:
+            confidence = 0.
+        # clean text
+        text = w.text.replace('...', '')
+
+        # left 0.08s for each audios
+        audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
+
+        # segment file name
+        fname = f"{audio_name}_seg{s_ind}.wav"
+
+        # filter out the segment shorter than 1.5s and longer than 20s
+        save = audio_seg.duration_seconds > 1.5 and \
+                audio_seg.duration_seconds < 20. and \
+                len(text) >= 2 and len(text) < 200
+
+        if save:
+            output_file = os.path.join(wavs_folder, fname)
+            audio_seg.export(output_file, format='wav')
+
+        if k < len(segments) - 1:
+            start_time = max(0, segments[k+1].start - 0.08)
+
+        s_ind = s_ind + 1
+    return wavs_folder
+
+
+def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
+    SAMPLE_RATE = 16000
+    audio_vad = get_audio_tensor(audio_path)
+    segments = get_vad_segments(
+        audio_vad,
+        output_sample=True,
+        min_speech_duration=0.1,
+        min_silence_duration=1,
+        method="silero",
+    )
+    segments = [(seg["start"], seg["end"]) for seg in segments]
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
+    print(segments)
+    audio_active = AudioSegment.silent(duration=0)
+    audio = AudioSegment.from_file(audio_path)
+
+    for start_time, end_time in segments:
+        audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
+
+    audio_dur = audio_active.duration_seconds
+    print(f'after vad: dur = {audio_dur}')
+    target_folder = os.path.join(target_dir, audio_name)
+    wavs_folder = os.path.join(target_folder, 'wavs')
+    os.makedirs(wavs_folder, exist_ok=True)
+    start_time = 0.
+    count = 0
+    num_splits = int(np.round(audio_dur / split_seconds))
+    assert num_splits > 0, 'input audio is too short'
+    interval = audio_dur / num_splits
+
+    for i in range(num_splits):
+        end_time = min(start_time + interval, audio_dur)
+        if i == num_splits - 1:
+            end_time = audio_dur
+        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+        audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
+        audio_seg.export(output_file, format='wav')
+        start_time = end_time
+        count += 1
+    return wavs_folder
+
+def hash_numpy_array(audio_path):
+    array, _ = librosa.load(audio_path, sr=None, mono=True)
+    # Convert the array to bytes
+    array_bytes = array.tobytes()
+    # Calculate the hash of the array bytes
+    hash_object = hashlib.sha256(array_bytes)
+    hash_value = hash_object.digest()
+    # Convert the hash value to base64
+    base64_value = base64.b64encode(hash_value)
+    return base64_value.decode('utf-8')[:16].replace('/', '_^')
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
+    device = vc_model.device
+    version = vc_model.version
+    print("OpenVoice version:", version)
+
+    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
+    se_path = os.path.join(target_dir, audio_name, 'se.pth')
+
+    # if os.path.isfile(se_path):
+    #     se = torch.load(se_path).to(device)
+    #     return se, audio_name
+    # if os.path.isdir(audio_path):
+    #     wavs_folder = audio_path
+
+    # if vad:
+    #     wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
+    # else:
+    #     wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
+
+    # audio_segs = glob(f'{wavs_folder}/*.wav')
+    # if len(audio_segs) == 0:
+    #     raise NotImplementedError('No audio segments found!')
+
+    return vc_model.extract_se([audio_path], se_save_path=se_path), audio_name
+
xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py
@@ -0,0 +1,209 @@
+import torch
+from torch.nn import functional as F
+
+import numpy as np
+
+
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+
+
+def piecewise_rational_quadratic_transform(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    tails=None,
+    tail_bound=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    if tails is None:
+        spline_fn = rational_quadratic_spline
+        spline_kwargs = {}
+    else:
+        spline_fn = unconstrained_rational_quadratic_spline
+        spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
+
+    outputs, logabsdet = spline_fn(
+        inputs=inputs,
+        unnormalized_widths=unnormalized_widths,
+        unnormalized_heights=unnormalized_heights,
+        unnormalized_derivatives=unnormalized_derivatives,
+        inverse=inverse,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+        **spline_kwargs
+    )
+    return outputs, logabsdet
+
+
+def searchsorted(bin_locations, inputs, eps=1e-6):
+    bin_locations[..., -1] += eps
+    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
+
+
+def unconstrained_rational_quadratic_spline(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    tails="linear",
+    tail_bound=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+    outside_interval_mask = ~inside_interval_mask
+
+    outputs = torch.zeros_like(inputs)
+    logabsdet = torch.zeros_like(inputs)
+
+    if tails == "linear":
+        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+        constant = np.log(np.exp(1 - min_derivative) - 1)
+        unnormalized_derivatives[..., 0] = constant
+        unnormalized_derivatives[..., -1] = constant
+
+        outputs[outside_interval_mask] = inputs[outside_interval_mask]
+        logabsdet[outside_interval_mask] = 0
+    else:
+        raise RuntimeError("{} tails are not implemented.".format(tails))
+
+    (
+        outputs[inside_interval_mask],
+        logabsdet[inside_interval_mask],
+    ) = rational_quadratic_spline(
+        inputs=inputs[inside_interval_mask],
+        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+        inverse=inverse,
+        left=-tail_bound,
+        right=tail_bound,
+        bottom=-tail_bound,
+        top=tail_bound,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+    )
+
+    return outputs, logabsdet
+
+
+def rational_quadratic_spline(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    left=0.0,
+    right=1.0,
+    bottom=0.0,
+    top=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    if torch.min(inputs) < left or torch.max(inputs) > right:
+        raise ValueError("Input to a transform is not within its domain")
+
+    num_bins = unnormalized_widths.shape[-1]
+
+    if min_bin_width * num_bins > 1.0:
+        raise ValueError("Minimal bin width too large for the number of bins")
+    if min_bin_height * num_bins > 1.0:
+        raise ValueError("Minimal bin height too large for the number of bins")
+
+    widths = F.softmax(unnormalized_widths, dim=-1)
+    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+    cumwidths = torch.cumsum(widths, dim=-1)
+    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
+    cumwidths = (right - left) * cumwidths + left
+    cumwidths[..., 0] = left
+    cumwidths[..., -1] = right
+    widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+    heights = F.softmax(unnormalized_heights, dim=-1)
+    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+    cumheights = torch.cumsum(heights, dim=-1)
+    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
+    cumheights = (top - bottom) * cumheights + bottom
+    cumheights[..., 0] = bottom
+    cumheights[..., -1] = top
+    heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+    if inverse:
+        bin_idx = searchsorted(cumheights, inputs)[..., None]
+    else:
+        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+    delta = heights / widths
+    input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+    input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+    if inverse:
+        a = (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        ) + input_heights * (input_delta - input_derivatives)
+        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        )
+        c = -input_delta * (inputs - input_cumheights)
+
+        discriminant = b.pow(2) - 4 * a * c
+        assert (discriminant >= 0).all()
+
+        root = (2 * c) / (-b - torch.sqrt(discriminant))
+        outputs = root * input_bin_widths + input_cumwidths
+
+        theta_one_minus_theta = root * (1 - root)
+        denominator = input_delta + (
+            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+            * theta_one_minus_theta
+        )
+        derivative_numerator = input_delta.pow(2) * (
+            input_derivatives_plus_one * root.pow(2)
+            + 2 * input_delta * theta_one_minus_theta
+            + input_derivatives * (1 - root).pow(2)
+        )
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+        return outputs, -logabsdet
+    else:
+        theta = (inputs - input_cumwidths) / input_bin_widths
+        theta_one_minus_theta = theta * (1 - theta)
+
+        numerator = input_heights * (
+            input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
+        )
+        denominator = input_delta + (
+            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+            * theta_one_minus_theta
+        )
+        outputs = input_cumheights + numerator / denominator
+
+        derivative_numerator = input_delta.pow(2) * (
+            input_derivatives_plus_one * theta.pow(2)
+            + 2 * input_delta * theta_one_minus_theta
+            + input_derivatives * (1 - theta).pow(2)
+        )
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+        return outputs, logabsdet