xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_compat.py +2 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +77 -71
- xinference/core/chat_interface.py +6 -1
- xinference/core/model.py +79 -19
- xinference/core/supervisor.py +172 -10
- xinference/core/utils.py +12 -8
- xinference/core/worker.py +102 -4
- xinference/deploy/cmdline.py +3 -1
- xinference/deploy/test/test_cmdline.py +56 -0
- xinference/isolation.py +24 -0
- xinference/model/audio/core.py +16 -0
- xinference/model/audio/cosyvoice.py +39 -6
- xinference/model/audio/f5tts.py +200 -0
- xinference/model/audio/f5tts_mlx.py +260 -0
- xinference/model/audio/fish_speech.py +36 -111
- xinference/model/audio/melotts.py +110 -0
- xinference/model/audio/model_spec.json +99 -3
- xinference/model/audio/model_spec_modelscope.json +27 -0
- xinference/model/audio/utils.py +32 -0
- xinference/model/audio/whisper.py +35 -10
- xinference/model/embedding/core.py +203 -142
- xinference/model/embedding/model_spec.json +7 -0
- xinference/model/embedding/model_spec_modelscope.json +8 -0
- xinference/model/image/core.py +69 -1
- xinference/model/image/model_spec.json +145 -4
- xinference/model/image/model_spec_modelscope.json +150 -4
- xinference/model/image/stable_diffusion/core.py +45 -13
- xinference/model/llm/__init__.py +4 -2
- xinference/model/llm/llm_family.json +536 -53
- xinference/model/llm/llm_family.py +15 -36
- xinference/model/llm/llm_family_modelscope.json +454 -20
- xinference/model/llm/memory.py +1 -1
- xinference/model/llm/mlx/core.py +248 -52
- xinference/model/llm/sglang/core.py +1 -0
- xinference/model/llm/transformers/chatglm.py +9 -5
- xinference/model/llm/transformers/cogagent.py +272 -0
- xinference/model/llm/transformers/core.py +2 -0
- xinference/model/llm/transformers/qwen2_vl.py +12 -1
- xinference/model/llm/transformers/utils.py +16 -8
- xinference/model/llm/utils.py +36 -4
- xinference/model/llm/vllm/core.py +53 -10
- xinference/model/llm/vllm/xavier/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/allocator.py +74 -0
- xinference/model/llm/vllm/xavier/block.py +111 -0
- xinference/model/llm/vllm/xavier/block_manager.py +71 -0
- xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
- xinference/model/llm/vllm/xavier/collective.py +74 -0
- xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
- xinference/model/llm/vllm/xavier/engine.py +247 -0
- xinference/model/llm/vllm/xavier/executor.py +134 -0
- xinference/model/llm/vllm/xavier/scheduler.py +438 -0
- xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
- xinference/model/llm/vllm/xavier/transfer.py +319 -0
- xinference/model/video/diffusers.py +14 -0
- xinference/model/video/model_spec.json +15 -0
- xinference/model/video/model_spec_modelscope.json +16 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
- xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
- xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
- xinference/thirdparty/cosyvoice/bin/train.py +42 -8
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
- xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
- xinference/thirdparty/cosyvoice/cli/model.py +330 -80
- xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
- xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
- xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
- xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
- xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
- xinference/thirdparty/cosyvoice/utils/common.py +28 -1
- xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
- xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
- xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
- xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
- xinference/thirdparty/f5_tts/api.py +166 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
- xinference/thirdparty/f5_tts/eval/README.md +49 -0
- xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
- xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
- xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
- xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
- xinference/thirdparty/f5_tts/infer/README.md +191 -0
- xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
- xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
- xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
- xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
- xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
- xinference/thirdparty/f5_tts/model/__init__.py +10 -0
- xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
- xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
- xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
- xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
- xinference/thirdparty/f5_tts/model/cfm.py +285 -0
- xinference/thirdparty/f5_tts/model/dataset.py +319 -0
- xinference/thirdparty/f5_tts/model/modules.py +658 -0
- xinference/thirdparty/f5_tts/model/trainer.py +366 -0
- xinference/thirdparty/f5_tts/model/utils.py +185 -0
- xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
- xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
- xinference/thirdparty/f5_tts/socket_server.py +159 -0
- xinference/thirdparty/f5_tts/train/README.md +77 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
- xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
- xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
- xinference/thirdparty/f5_tts/train/train.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
- xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
- xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
- xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
- xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
- xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
- xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
- xinference/thirdparty/fish_speech/tools/schema.py +11 -28
- xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
- xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
- xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
- xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
- xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
- xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
- xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
- xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
- xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
- xinference/thirdparty/matcha/utils/utils.py +2 -2
- xinference/thirdparty/melo/api.py +135 -0
- xinference/thirdparty/melo/app.py +61 -0
- xinference/thirdparty/melo/attentions.py +459 -0
- xinference/thirdparty/melo/commons.py +160 -0
- xinference/thirdparty/melo/configs/config.json +94 -0
- xinference/thirdparty/melo/data/example/metadata.list +20 -0
- xinference/thirdparty/melo/data_utils.py +413 -0
- xinference/thirdparty/melo/download_utils.py +67 -0
- xinference/thirdparty/melo/infer.py +25 -0
- xinference/thirdparty/melo/init_downloads.py +14 -0
- xinference/thirdparty/melo/losses.py +58 -0
- xinference/thirdparty/melo/main.py +36 -0
- xinference/thirdparty/melo/mel_processing.py +174 -0
- xinference/thirdparty/melo/models.py +1030 -0
- xinference/thirdparty/melo/modules.py +598 -0
- xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
- xinference/thirdparty/melo/monotonic_align/core.py +46 -0
- xinference/thirdparty/melo/preprocess_text.py +135 -0
- xinference/thirdparty/melo/split_utils.py +174 -0
- xinference/thirdparty/melo/text/__init__.py +35 -0
- xinference/thirdparty/melo/text/chinese.py +199 -0
- xinference/thirdparty/melo/text/chinese_bert.py +107 -0
- xinference/thirdparty/melo/text/chinese_mix.py +253 -0
- xinference/thirdparty/melo/text/cleaner.py +36 -0
- xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
- xinference/thirdparty/melo/text/cmudict.rep +129530 -0
- xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
- xinference/thirdparty/melo/text/english.py +284 -0
- xinference/thirdparty/melo/text/english_bert.py +39 -0
- xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
- xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
- xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
- xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
- xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
- xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
- xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
- xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
- xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
- xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
- xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
- xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
- xinference/thirdparty/melo/text/french.py +94 -0
- xinference/thirdparty/melo/text/french_bert.py +39 -0
- xinference/thirdparty/melo/text/japanese.py +647 -0
- xinference/thirdparty/melo/text/japanese_bert.py +49 -0
- xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
- xinference/thirdparty/melo/text/korean.py +192 -0
- xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
- xinference/thirdparty/melo/text/spanish.py +122 -0
- xinference/thirdparty/melo/text/spanish_bert.py +39 -0
- xinference/thirdparty/melo/text/symbols.py +290 -0
- xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
- xinference/thirdparty/melo/train.py +635 -0
- xinference/thirdparty/melo/train.sh +19 -0
- xinference/thirdparty/melo/transforms.py +209 -0
- xinference/thirdparty/melo/utils.py +424 -0
- xinference/types.py +15 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
- xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
- xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
- xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +67 -3
- xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
- xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
- xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
- xinference/web/ui/node_modules/i18next/package.json +129 -0
- xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
- xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
- xinference/web/ui/node_modules/react-i18next/package.json +162 -0
- xinference/web/ui/node_modules/void-elements/package.json +34 -0
- xinference/web/ui/package-lock.json +69 -3
- xinference/web/ui/package.json +2 -0
- xinference/web/ui/src/locales/en.json +186 -0
- xinference/web/ui/src/locales/zh.json +186 -0
- {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/METADATA +68 -32
- {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/RECORD +316 -122
- xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
- xinference/thirdparty/fish_speech/tools/api.py +0 -943
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
- xinference/thirdparty/fish_speech/tools/webui.py +0 -548
- xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
- xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
- xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
- /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
- /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
- /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
- /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
- /xinference/thirdparty/{fish_speech/tools → melo/text/fr_phonemizer}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
- {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
- {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/WHEEL +0 -0
- {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
data/example/wavs/000.wav|EN-default|EN|Well, there are always new trends and styles emerging in the fashion world, but I think some of the biggest trends at the moment include sustainability and ethical fashion, streetwear and athleisure, and oversized and deconstructed silhouettes.
|
|
2
|
+
data/example/wavs/001.wav|EN-default|EN|Many designers and brands are focusing on creating more environmentally-friendly and socially responsible clothing, while others are incorporating elements of sportswear and casual wear into their collections.
|
|
3
|
+
data/example/wavs/002.wav|EN-default|EN|And there's a growing interest in looser, more relaxed shapes and unconventional materials and finishes.
|
|
4
|
+
data/example/wavs/003.wav|EN-default|EN|That's really insightful.
|
|
5
|
+
data/example/wavs/004.wav|EN-default|EN|What do you think are some of the benefits of following fashion trends?
|
|
6
|
+
data/example/wavs/005.wav|EN-default|EN|Well, I think one of the main benefits of following fashion trends is that it can be a way to express your creativity, personality, and individuality.
|
|
7
|
+
data/example/wavs/006.wav|EN-default|EN|Fashion can be a powerful tool for self-expression and can help you feel more confident and comfortable in your own skin.
|
|
8
|
+
data/example/wavs/007.wav|EN-default|EN|Additionally, staying up-to-date with fashion trends can help you develop your own sense of style and learn how to put together outfits that make you look and feel great.
|
|
9
|
+
data/example/wavs/008.wav|EN-default|EN|That's a great point.
|
|
10
|
+
data/example/wavs/009.wav|EN-default|EN|Do you think it's important to stay on top of the latest fashion trends, or is it more important to focus on timeless style?
|
|
11
|
+
data/example/wavs/010.wav|EN-default|EN|I think it's really up to each individual to decide what approach to fashion works best for them.
|
|
12
|
+
data/example/wavs/011.wav|EN-default|EN|Some people prefer to stick with classic, timeless styles that never go out of fashion, while others enjoy experimenting with new and innovative trends.
|
|
13
|
+
data/example/wavs/012.wav|EN-default|EN|Ultimately, fashion is about personal expression and there's no right or wrong way to approach it.
|
|
14
|
+
data/example/wavs/013.wav|EN-default|EN|The most important thing is to wear what makes you feel good and confident.
|
|
15
|
+
data/example/wavs/014.wav|EN-default|EN|I completely agree.
|
|
16
|
+
data/example/wavs/015.wav|EN-default|EN|Some popular ones that come to mind are oversized blazers, statement sleeves, printed maxi dresses, and chunky sneakers.
|
|
17
|
+
data/example/wavs/016.wav|EN-default|EN|It's been really interesting chatting with you about fashion.
|
|
18
|
+
data/example/wavs/017.wav|EN-default|EN|That's a good point.
|
|
19
|
+
data/example/wavs/018.wav|EN-default|EN|What do you think are some current fashion trends that are popular right now?
|
|
20
|
+
data/example/wavs/019.wav|EN-default|EN|There are so many trends happening right now, it's hard to keep track of them all!
|
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import random
|
|
3
|
+
import torch
|
|
4
|
+
import torch.utils.data
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
from loguru import logger
|
|
7
|
+
import commons
|
|
8
|
+
from mel_processing import spectrogram_torch, mel_spectrogram_torch
|
|
9
|
+
from utils import load_filepaths_and_text
|
|
10
|
+
from utils import load_wav_to_torch_librosa as load_wav_to_torch
|
|
11
|
+
from text import cleaned_text_to_sequence, get_bert
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
"""Multi speaker version"""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|
18
|
+
"""
|
|
19
|
+
1) loads audio, speaker_id, text pairs
|
|
20
|
+
2) normalizes text and converts them to sequences of integers
|
|
21
|
+
3) computes spectrograms from audio files.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def __init__(self, audiopaths_sid_text, hparams):
    """Load the entry list, copy settings from ``hparams``, then shuffle and filter.

    Args:
        audiopaths_sid_text: Forwarded to ``load_filepaths_and_text`` to
            obtain the list of dataset entries.
        hparams: Configuration object. ``max_wav_value``, ``sampling_rate``,
            ``filter_length``, ``hop_length``, ``win_length``, ``spk2id``
            and ``add_blank`` are read directly; every other option falls
            back to a default when the attribute is absent.
    """
    self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)

    # Mandatory audio settings, copied verbatim from the config.
    for required in (
        "max_wav_value",
        "sampling_rate",
        "filter_length",
        "hop_length",
        "win_length",
    ):
        setattr(self, required, getattr(hparams, required))
    self.spk_map = hparams.spk2id
    self.hparams = hparams

    # Optional switches with their fallbacks.
    self.disable_bert = getattr(hparams, "disable_bert", False)
    mel_posterior = getattr(hparams, "use_mel_posterior_encoder", False)
    self.use_mel_spec_posterior = mel_posterior
    if mel_posterior:
        # n_mel_channels is only defined when the mel posterior is enabled.
        self.n_mel_channels = getattr(hparams, "n_mel_channels", 80)
    self.cleaned_text = getattr(hparams, "cleaned_text", False)

    self.add_blank = hparams.add_blank
    self.min_text_len = getattr(hparams, "min_text_len", 1)
    self.max_text_len = getattr(hparams, "max_text_len", 300)

    # Seeding the module-level RNG makes the shuffle order repeatable.
    random.seed(1234)
    random.shuffle(self.audiopaths_sid_text)
    self._filter()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _filter(self):
    """Drop entries whose phone string is out of range and record length estimates.

    Every entry must unpack into seven fields: id (used as the audio path),
    speaker, language, text, phones, tone and word2ph. Entries that pass
    the length check have ``phones`` split on spaces and ``tone`` /
    ``word2ph`` converted to lists of ints. On completion
    ``self.audiopaths_sid_text`` holds only the kept entries and
    ``self.lengths`` holds one estimated spectrogram length per kept entry.

    Raises:
        ValueError, TypeError: Re-raised (after printing the entry) when an
            entry cannot be unpacked into seven fields.
        OSError: Propagated from ``os.path.getsize`` when an audio file
            cannot be inspected.
    """
    # Spectrogram lengths are stored for bucketing and estimated from the
    # file size instead of decoding the audio:
    #   wav_length ~= file_size / (wav_channels * bytes per sample)
    #              = file_size / (1 * 2)
    #   spec_length = wav_length // hop_length
    kept_entries = []
    lengths = []
    skipped = 0
    logger.info("Init dataset...")
    for item in tqdm(self.audiopaths_sid_text):
        try:
            _id, spk, language, text, phones, tone, word2ph = item
        except (ValueError, TypeError):
            # Show the malformed entry before propagating the failure.
            print(item)
            raise
        audiopath = f"{_id}"
        # NOTE: the bounds are checked against the raw phone string, i.e.
        # before it is split on spaces.
        if not (self.min_text_len <= len(phones) <= self.max_text_len):
            skipped += 1
            continue
        phones = phones.split(" ")
        tone = [int(i) for i in tone.split(" ")]
        word2ph = [int(i) for i in word2ph.split(" ")]
        kept_entries.append(
            [audiopath, spk, language, text, phones, tone, word2ph]
        )
        lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))

    if lengths:
        logger.info(f'min: {min(lengths)}; max: {max(lengths)}')
    else:
        # min()/max() raise ValueError on an empty list; report instead.
        logger.warning("No dataset entries passed the text-length filter")
    logger.info(
        "skipped: "
        + str(skipped)
        + ", total: "
        + str(len(self.audiopaths_sid_text))
    )
    self.audiopaths_sid_text = kept_entries
    self.lengths = lengths
|
|
93
|
+
|
|
94
|
+
def get_audio_text_speaker_pair(self, audiopath_sid_text):
    """Build the model inputs for one dataset entry.

    ``audiopath_sid_text`` is a seven-field entry
    (audio path, speaker, language, text, phones, tone, word2ph).

    Returns an 8-tuple
    ``(phones, spec, wav, sid, tone, language, bert, ja_bert)`` where ``sid``
    is a one-element ``torch.LongTensor``; a speaker that is not an attribute
    of ``self.spk_map`` falls back to id 0.
    """
    # separate filename, speaker_id and text
    wav_path, speaker, lang, raw_text, phone_seq, tone_seq, word2ph = audiopath_sid_text

    text_features = self.get_text(
        raw_text, word2ph, phone_seq, tone_seq, lang, wav_path
    )
    bert, ja_bert, phone_seq, tone_seq, lang = text_features

    spec, wav = self.get_audio(wav_path)

    speaker_index = int(getattr(self.spk_map, speaker, '0'))
    speaker_tensor = torch.LongTensor([speaker_index])
    return (phone_seq, spec, wav, speaker_tensor, tone_seq, lang, bert, ja_bert)
|
|
106
|
+
|
|
107
|
+
def get_audio(self, filename):
    """Load a waveform and compute its spectrogram.

    Args:
        filename: path of the audio file. ".wav" in the path is replaced by
            ".spec.pt" (".mel.pt" when ``self.use_mel_spec_posterior`` is
            set) to obtain the path the spectrogram is saved to.

    Returns:
        ``(spec, audio_norm)``: the spectrogram with dimension 0 squeezed,
        and the loaded audio with a new leading dimension.

    Raises:
        ValueError: if the loaded sampling rate differs from
            ``self.sampling_rate``.
    """
    audio_norm, sampling_rate = load_wav_to_torch(filename, self.sampling_rate)
    if sampling_rate != self.sampling_rate:
        raise ValueError(
            "{} {} SR doesn't match target {} SR".format(
                filename, sampling_rate, self.sampling_rate
            )
        )
    # NOTE: normalize has been achieved by torchaudio
    # audio_norm = audio / self.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    spec_filename = filename.replace(".wav", ".spec.pt")
    if self.use_mel_spec_posterior:
        spec_filename = spec_filename.replace(".spec.pt", ".mel.pt")
    try:
        spec = torch.load(spec_filename)
        # The unconditional assert sends every call down the recompute path,
        # so the saved file is overwritten rather than reused.
        # NOTE(review): with asserts disabled (python -O) a successfully
        # loaded file would be returned as-is - confirm which is intended.
        assert False
    except Exception:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
        # raised while loading are not swallowed.
        if self.use_mel_spec_posterior:
            spec = mel_spectrogram_torch(
                audio_norm,
                self.filter_length,
                self.n_mel_channels,
                self.sampling_rate,
                self.hop_length,
                self.win_length,
                self.hparams.mel_fmin,
                self.hparams.mel_fmax,
                center=False,
            )
        else:
            spec = spectrogram_torch(
                audio_norm,
                self.filter_length,
                self.sampling_rate,
                self.hop_length,
                self.win_length,
                center=False,
            )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_filename)
    return spec, audio_norm
|
|
149
|
+
|
|
150
|
+
def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
    """Convert the text-side fields of one entry into model inputs.

    Args:
        text: raw text, passed to ``get_bert`` when features are regenerated.
        word2ph: list of per-word counts; not modified by this method.
        phone, tone: sequences passed to ``cleaned_text_to_sequence``.
        language_str: language code such as "ZH" or "EN".
        wav_path: audio path; ".wav" is replaced by ".bert.pt" to locate the
            saved BERT features.

    Returns:
        ``(bert, ja_bert, phone, tone, language)`` where ``phone``, ``tone``
        and ``language`` are ``torch.LongTensor``. Depending on
        ``language_str`` one of ``bert`` (1024 rows) / ``ja_bert`` (768 rows)
        carries the features and the other is zeros; both are zeros when
        ``self.disable_bert`` is set.

    Raises:
        ValueError: if BERT is enabled and ``language_str`` is not one of the
            supported codes.
    """
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
    if self.add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        # Build a new list instead of doubling in place: the caller's list
        # would otherwise be doubled again on every access to this entry.
        word2ph = [count * 2 for count in word2ph]
        word2ph[0] += 1
    bert_path = wav_path.replace(".wav", ".bert.pt")
    bert = None
    try:
        bert = torch.load(bert_path)
        # Explicit check (not assert) so a stale file is still rejected when
        # assertions are disabled.
        if bert.shape[-1] != len(phone):
            raise ValueError("saved BERT features do not match the phone length")
    except Exception as e:
        # `bert` is still None when the load itself failed, so read its
        # shape defensively rather than crashing inside the handler.
        print(e, wav_path, bert_path, getattr(bert, "shape", None), len(phone))
        bert = get_bert(text, word2ph, language_str)
        torch.save(bert, bert_path)
        assert bert.shape[-1] == len(phone), phone

    if self.disable_bert:
        bert = torch.zeros(1024, len(phone))
        ja_bert = torch.zeros(768, len(phone))
    elif language_str in ["ZH"]:
        ja_bert = torch.zeros(768, len(phone))
    elif language_str in ["JP", "EN", "ZH_MIX_EN", "KR", 'SP', 'ES', 'FR', 'DE', 'RU']:
        ja_bert = bert
        bert = torch.zeros(1024, len(phone))
    else:
        # Previously a bare `raise` with no active exception.
        raise ValueError(f"Unsupported language: {language_str!r}")
    assert bert.shape[-1] == len(phone)
    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)
    return bert, ja_bert, phone, tone, language
|
|
188
|
+
|
|
189
|
+
def get_sid(self, sid):
    """Return ``sid`` converted to int and wrapped in a one-element ``torch.LongTensor``."""
    speaker_index = int(sid)
    return torch.LongTensor([speaker_index])
|
|
192
|
+
|
|
193
|
+
def __getitem__(self, index):
    """Return the sample tuple built from the entry stored at ``index``."""
    entry = self.audiopaths_sid_text[index]
    return self.get_audio_text_speaker_pair(entry)
|
|
195
|
+
|
|
196
|
+
def __len__(self):
    """Return the number of entries in ``self.audiopaths_sid_text``."""
    entries = self.audiopaths_sid_text
    return len(entries)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class TextAudioSpeakerCollate:
    """Zero-pads model inputs and targets"""

    def __init__(self, return_ids=False):
        # Stored only; nothing in this class reads it.
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate a list of samples into zero-padded batch tensors.

        Each sample is indexed as
        ``(text, spec, wav, sid, tone, language, bert, ja_bert)``.
        Samples are written to the output in order of decreasing
        spectrogram length (``spec.size(1)``).

        Returns an 11-tuple:
        ``(text_padded, text_lengths, spec_padded, spec_lengths, wav_padded,
        wav_lengths, sid, tone_padded, language_padded, bert_padded,
        ja_bert_padded)``.
        """
        n = len(batch)
        spec_frames = [sample[1].size(1) for sample in batch]
        _, order = torch.sort(torch.LongTensor(spec_frames), dim=0, descending=True)

        # Right zero-pad every sequence to the longest one in the batch.
        max_text_len = max(len(sample[0]) for sample in batch)
        max_spec_len = max(spec_frames)
        max_wav_len = max(sample[2].size(1) for sample in batch)

        # Every element of these is written in the loop below.
        text_lengths = torch.LongTensor(n)
        spec_lengths = torch.LongTensor(n)
        wav_lengths = torch.LongTensor(n)
        sid = torch.LongTensor(n)

        text_padded = torch.LongTensor(n, max_text_len).zero_()
        tone_padded = torch.LongTensor(n, max_text_len).zero_()
        language_padded = torch.LongTensor(n, max_text_len).zero_()
        bert_padded = torch.FloatTensor(n, 1024, max_text_len).zero_()
        ja_bert_padded = torch.FloatTensor(n, 768, max_text_len).zero_()
        spec_padded = torch.FloatTensor(n, batch[0][1].size(0), max_spec_len).zero_()
        wav_padded = torch.FloatTensor(n, 1, max_wav_len).zero_()

        for dst, src in enumerate(order.tolist()):
            sample = batch[src]

            text = sample[0]
            text_padded[dst, : text.size(0)] = text
            text_lengths[dst] = text.size(0)

            spec = sample[1]
            spec_padded[dst, :, : spec.size(1)] = spec
            spec_lengths[dst] = spec.size(1)

            wav = sample[2]
            wav_padded[dst, :, : wav.size(1)] = wav
            wav_lengths[dst] = wav.size(1)

            sid[dst] = sample[3]

            tone = sample[4]
            tone_padded[dst, : tone.size(0)] = tone

            language = sample[5]
            language_padded[dst, : language.size(0)] = language

            bert = sample[6]
            bert_padded[dst, :, : bert.size(1)] = bert

            ja_bert = sample[7]
            ja_bert_padded[dst, :, : ja_bert.size(1)] = ja_bert

        return (
            text_padded,
            text_lengths,
            spec_padded,
            spec_lengths,
            wav_padded,
            wav_lengths,
            sid,
            tone_padded,
            language_padded,
            bert_padded,
            ja_bert_padded,
        )
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
    Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.

    It removes samples which are not included in the boundaries.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.

    Iterating the sampler yields lists of dataset indices (one list per
    batch); ``len()`` is the number of batches per replica.
    """

    def __init__(
        self,
        dataset,
        batch_size,
        boundaries,
        num_replicas=None,
        rank=None,
        shuffle=True,
    ):
        """
        Args:
            dataset: dataset exposing a ``lengths`` sequence (one entry per sample).
            batch_size: number of indices in every yielded batch.
            boundaries: increasing length boundaries defining the buckets;
                the caller's sequence is not modified.
            num_replicas, rank, shuffle: forwarded to ``DistributedSampler``.
        """
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        # Work on a private copy: boundaries of empty buckets are removed in
        # _create_buckets and must not leak back into the caller's list.
        self.boundaries = list(boundaries)

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas
        print('buckets:', self.num_samples_per_bucket)

    def _create_buckets(self):
        """Assign sample indices to buckets and drop the empty buckets.

        Returns ``(buckets, num_samples_per_bucket)``; each per-bucket count
        is the bucket size rounded up to a multiple of
        ``num_replicas * batch_size``.
        """
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for i, length in enumerate(self.lengths):
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

        # Remove every empty bucket, the first one included, together with
        # its upper boundary. Walk backwards so pending indices stay valid.
        for i in range(len(buckets) - 1, -1, -1):
            if not buckets[i]:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        total_batch_size = self.num_replicas * self.batch_size
        num_samples_per_bucket = []
        for bucket in buckets:
            len_bucket = len(bucket)
            rem = (
                total_batch_size - (len_bucket % total_batch_size)
            ) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket

    def __iter__(self):
        """Build this epoch's batches for this rank and return an iterator over them."""
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        indices = []
        if self.shuffle:
            for bucket in self.buckets:
                indices.append(torch.randperm(len(bucket), generator=g).tolist())
        else:
            for bucket in self.buckets:
                indices.append(list(range(len(bucket))))

        batches = []
        for i in range(len(self.buckets)):
            bucket = self.buckets[i]
            len_bucket = len(bucket)
            if len_bucket == 0:
                continue
            ids_bucket = indices[i]
            num_samples_bucket = self.num_samples_per_bucket[i]

            # add extra samples to make it evenly divisible
            rem = num_samples_bucket - len_bucket
            ids_bucket = (
                ids_bucket
                + ids_bucket * (rem // len_bucket)
                + ids_bucket[: (rem % len_bucket)]
            )

            # subsample
            ids_bucket = ids_bucket[self.rank :: self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [
                    bucket[idx]
                    for idx in ids_bucket[
                        j * self.batch_size : (j + 1) * self.batch_size
                    ]
                ]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """Return the index ``i`` with ``boundaries[i] < x <= boundaries[i + 1]``, or -1 if none."""
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1

    def __len__(self):
        """Return the number of batches per replica."""
        return self.num_samples // self.batch_size
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import os
|
|
3
|
+
from . import utils
|
|
4
|
+
from cached_path import cached_path
|
|
5
|
+
from huggingface_hub import hf_hub_download
|
|
6
|
+
|
|
7
|
+
# Common prefix of every file hosted on the S3 bucket.
_BASE_SPEAKERS_URL = 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers'
# Language codes that have a checkpoint/config pair on the S3 bucket.
_S3_LANGUAGES = ('EN', 'EN_V2', 'FR', 'JP', 'ES', 'ZH', 'KR')

# Language code -> checkpoint URL.
DOWNLOAD_CKPT_URLS = {
    lang: f'{_BASE_SPEAKERS_URL}/{lang}/checkpoint.pth' for lang in _S3_LANGUAGES
}

# Language code -> config URL.
DOWNLOAD_CONFIG_URLS = {
    lang: f'{_BASE_SPEAKERS_URL}/{lang}/config.json' for lang in _S3_LANGUAGES
}

# File name -> URL of the pretrained weights.
PRETRAINED_MODELS = {
    name: f'{_BASE_SPEAKERS_URL}/pretrained/{name}'
    for name in ('G.pth', 'D.pth', 'DUR.pth')
}

# Language code -> Hugging Face repository id.
LANG_TO_HF_REPO_ID = {
    lang: f'myshell-ai/MeloTTS-{suffix}'
    for lang, suffix in (
        ('EN', 'English'),
        ('EN_V2', 'English-v2'),
        ('EN_NEWEST', 'English-v3'),
        ('FR', 'French'),
        ('JP', 'Japanese'),
        ('ES', 'Spanish'),
        ('ZH', 'Chinese'),
        ('KR', 'Korean'),
    )
}
|
|
43
|
+
|
|
44
|
+
def load_or_download_config(locale, use_hf=True, config_path=None):
    """Return the hyper-parameters parsed from a config file.

    When ``config_path`` is None the file is fetched first: the language is
    the upper-cased part of ``locale`` before the first '-', and the config
    comes from the Hugging Face repo in ``LANG_TO_HF_REPO_ID`` (``use_hf``)
    or from the URL in ``DOWNLOAD_CONFIG_URLS`` otherwise.
    """
    if config_path is not None:
        return utils.get_hparams_from_file(config_path)

    language = locale.split('-')[0].upper()
    if use_hf:
        assert language in LANG_TO_HF_REPO_ID
        repo_id = LANG_TO_HF_REPO_ID[language]
        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
    else:
        assert language in DOWNLOAD_CONFIG_URLS
        url = DOWNLOAD_CONFIG_URLS[language]
        config_path = cached_path(url)
    return utils.get_hparams_from_file(config_path)
|
|
54
|
+
|
|
55
|
+
def load_or_download_model(locale, device, use_hf=True, ckpt_path=None):
    """Return the checkpoint loaded with ``torch.load`` onto ``device``.

    When ``ckpt_path`` is None the file is fetched first: the language is
    the upper-cased part of ``locale`` before the first '-', and the
    checkpoint comes from the Hugging Face repo in ``LANG_TO_HF_REPO_ID``
    (``use_hf``) or from the URL in ``DOWNLOAD_CKPT_URLS`` otherwise.
    """
    if ckpt_path is not None:
        return torch.load(ckpt_path, map_location=device)

    language = locale.split('-')[0].upper()
    if use_hf:
        assert language in LANG_TO_HF_REPO_ID
        repo_id = LANG_TO_HF_REPO_ID[language]
        ckpt_path = hf_hub_download(repo_id=repo_id, filename="checkpoint.pth")
    else:
        assert language in DOWNLOAD_CKPT_URLS
        url = DOWNLOAD_CKPT_URLS[language]
        ckpt_path = cached_path(url)
    # NOTE(review): torch.load unpickles the downloaded file - only point
    # this at sources you trust.
    return torch.load(ckpt_path, map_location=device)
|
|
65
|
+
|
|
66
|
+
def load_pretrain_model():
    """Return ``cached_path(url)`` for every URL in ``PRETRAINED_MODELS``, in dict order."""
    local_paths = []
    for url in PRETRAINED_MODELS.values():
        local_paths.append(cached_path(url))
    return local_paths
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import click
|
|
3
|
+
from melo.api import TTS
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@click.command()
@click.option('--ckpt_path', '-m', type=str, default=None, help="Path to the checkpoint file")
@click.option('--text', '-t', type=str, default=None, help="Text to speak")
@click.option('--language', '-l', type=str, default="EN", help="Language of the model")
@click.option('--output_dir', '-o', type=str, default="outputs", help="Path to the output")
def main(ckpt_path, text, language, output_dir):
    """Synthesize TEXT once per speaker of a checkpoint, writing <output_dir>/<speaker>/output.wav."""
    # Fail early with a message that names the real option (--ckpt_path).
    if ckpt_path is None:
        raise ValueError("The ckpt_path must be specified")
    if text is None:
        raise ValueError("The text must be specified")

    # The config is read from config.json in the checkpoint's directory.
    config_path = os.path.join(os.path.dirname(ckpt_path), 'config.json')
    model = TTS(language=language, config_path=config_path, ckpt_path=ckpt_path)

    for spk_name, spk_id in model.hps.data.spk2id.items():
        save_path = f'{output_dir}/{spk_name}/output.wav'
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        model.tts_to_file(text, spk_id, save_path)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
if __name__ == '__main__':
    # Instantiate one TTS model per language.
    # NOTE(review): presumably run for the download side effect of
    # constructing each model - confirm against melo.api.TTS.
    from melo.api import TTS

    device = 'auto'
    models = {
        lang: TTS(language=lang, device=device)
        for lang in ('EN', 'ES', 'FR', 'ZH', 'JP', 'KR')
    }
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference between paired feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel nested sequences of tensors; the
    tensors from ``fmap_r`` are detached before the comparison. Returns the
    int 0 when there is nothing to compare.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            diff = real.float().detach() - gen.float()
            total = total + diff.abs().mean()

    return total * 2
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss over paired real/generated outputs.

    Returns ``(loss, r_losses, g_losses)``: the summed loss, and the
    per-pair real and generated terms as Python floats.
    """
    total = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = ((1 - real_out.float()) ** 2).mean()
        gen_term = (gen_out.float() ** 2).mean()
        total = total + (real_term + gen_term)
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())

    return total, r_losses, g_losses
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def generator_loss(disc_outputs):
    """Least-squares generator loss over the discriminator outputs.

    Returns ``(loss, gen_losses)``: the summed loss and the list of
    per-output loss tensors.
    """
    total = 0
    gen_losses = []
    for output in disc_outputs:
        term = ((1 - output.float()) ** 2).mean()
        gen_losses.append(term)
        total = total + term

    return total, gen_losses
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """
    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    All inputs are cast to float. Returns the masked sum of the KL term
    divided by the sum of ``z_mask``.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (
        t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask)
    )

    squared_error = (z_p - m_p) ** 2
    kl = logs_p - logs_q - 0.5
    kl = kl + 0.5 * squared_error * torch.exp(-2.0 * logs_p)
    masked_total = torch.sum(kl * z_mask)
    return masked_total / torch.sum(z_mask)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import click
|
|
2
|
+
import warnings
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@click.command
@click.argument('text')
@click.argument('output_path')
@click.option("--file", '-f', is_flag=True, show_default=True, default=False, help="Text is a file")
@click.option('--language', '-l', default='EN', help='Language, defaults to English', type=click.Choice(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], case_sensitive=False))
@click.option('--speaker', '-spk', default='EN-Default', help='Speaker ID, only for English, leave empty for default, ignored if not English. If English, defaults to "EN-Default"', type=click.Choice(['EN-Default', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU']))
@click.option('--speed', '-s', default=1.0, help='Speed, defaults to 1.0', type=float)
@click.option('--device', '-d', default='auto', help='Device, defaults to auto')
def main(text, file, output_path, language, speaker, speed, device):
    """Synthesize TEXT (or the contents of the file TEXT with --file) to OUTPUT_PATH."""
    if file:
        if not os.path.exists(text):
            raise FileNotFoundError(f'Trying to load text from file due to --file/-f flag, but file not found. Remove the --file/-f flag to pass a string.')
        else:
            with open(text) as f:
                text = f.read().strip()
    if text == '':
        raise ValueError('You entered empty text or the file you passed was empty.')
    language = language.upper()
    if language == '': language = 'EN'
    if speaker == '': speaker = None
    # The speaker only applies to English. `--speaker` defaults to
    # 'EN-Default', so exclude that value to avoid warning on every
    # non-English run.
    if language != 'EN' and speaker and speaker != 'EN-Default':
        warnings.warn('You specified a speaker but the language is not English; the speaker is ignored.')
    # Imported here rather than at module level, as in the original.
    from melo.api import TTS
    model = TTS(language=language, device=device)
    speaker_ids = model.hps.data.spk2id
    if language == 'EN':
        if not speaker: speaker = 'EN-Default'
        spkr = speaker_ids[speaker]
    else:
        # Non-English: use the first speaker id in the model's speaker map.
        spkr = speaker_ids[list(speaker_ids.keys())[0]]
    model.tts_to_file(text, spkr, output_path, speed=speed)
|