xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (373) hide show
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,20 @@
1
+ data/example/wavs/000.wav|EN-default|EN|Well, there are always new trends and styles emerging in the fashion world, but I think some of the biggest trends at the moment include sustainability and ethical fashion, streetwear and athleisure, and oversized and deconstructed silhouettes.
2
+ data/example/wavs/001.wav|EN-default|EN|Many designers and brands are focusing on creating more environmentally-friendly and socially responsible clothing, while others are incorporating elements of sportswear and casual wear into their collections.
3
+ data/example/wavs/002.wav|EN-default|EN|And there's a growing interest in looser, more relaxed shapes and unconventional materials and finishes.
4
+ data/example/wavs/003.wav|EN-default|EN|That's really insightful.
5
+ data/example/wavs/004.wav|EN-default|EN|What do you think are some of the benefits of following fashion trends?
6
+ data/example/wavs/005.wav|EN-default|EN|Well, I think one of the main benefits of following fashion trends is that it can be a way to express your creativity, personality, and individuality.
7
+ data/example/wavs/006.wav|EN-default|EN|Fashion can be a powerful tool for self-expression and can help you feel more confident and comfortable in your own skin.
8
+ data/example/wavs/007.wav|EN-default|EN|Additionally, staying up-to-date with fashion trends can help you develop your own sense of style and learn how to put together outfits that make you look and feel great.
9
+ data/example/wavs/008.wav|EN-default|EN|That's a great point.
10
+ data/example/wavs/009.wav|EN-default|EN|Do you think it's important to stay on top of the latest fashion trends, or is it more important to focus on timeless style?
11
+ data/example/wavs/010.wav|EN-default|EN|I think it's really up to each individual to decide what approach to fashion works best for them.
12
+ data/example/wavs/011.wav|EN-default|EN|Some people prefer to stick with classic, timeless styles that never go out of fashion, while others enjoy experimenting with new and innovative trends.
13
+ data/example/wavs/012.wav|EN-default|EN|Ultimately, fashion is about personal expression and there's no right or wrong way to approach it.
14
+ data/example/wavs/013.wav|EN-default|EN|The most important thing is to wear what makes you feel good and confident.
15
+ data/example/wavs/014.wav|EN-default|EN|I completely agree.
16
+ data/example/wavs/015.wav|EN-default|EN|Some popular ones that come to mind are oversized blazers, statement sleeves, printed maxi dresses, and chunky sneakers.
17
+ data/example/wavs/016.wav|EN-default|EN|It's been really interesting chatting with you about fashion.
18
+ data/example/wavs/017.wav|EN-default|EN|That's a good point.
19
+ data/example/wavs/018.wav|EN-default|EN|What do you think are some current fashion trends that are popular right now?
20
+ data/example/wavs/019.wav|EN-default|EN|There are so many trends happening right now, it's hard to keep track of them all!
@@ -0,0 +1,413 @@
1
+ import os
2
+ import random
3
+ import torch
4
+ import torch.utils.data
5
+ from tqdm import tqdm
6
+ from loguru import logger
7
+ import commons
8
+ from mel_processing import spectrogram_torch, mel_spectrogram_torch
9
+ from utils import load_filepaths_and_text
10
+ from utils import load_wav_to_torch_librosa as load_wav_to_torch
11
+ from text import cleaned_text_to_sequence, get_bert
12
+ import numpy as np
13
+
14
+ """Multi speaker version"""
15
+
16
+
17
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
    """Multi-speaker dataset yielding text, audio and BERT features.

    For every metadata row it
      1) loads the audio file and the speaker id,
      2) converts phones / tones / language into integer sequences,
      3) computes a (mel-)spectrogram from the audio,
      4) loads (or regenerates) the BERT features stored next to the wav.
    """

    def __init__(self, audiopaths_sid_text, hparams):
        """Read the metadata file, shuffle it deterministically and filter it.

        Args:
            audiopaths_sid_text: path handed to ``load_filepaths_and_text``.
            hparams: data hyper-parameters. ``max_wav_value``,
                ``sampling_rate``, ``filter_length``, ``hop_length``,
                ``win_length``, ``spk2id`` and ``add_blank`` are required;
                the remaining settings are optional and read with defaults.
        """
        self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.spk_map = hparams.spk2id
        self.hparams = hparams
        self.disable_bert = getattr(hparams, "disable_bert", False)

        self.use_mel_spec_posterior = getattr(
            hparams, "use_mel_posterior_encoder", False
        )
        if self.use_mel_spec_posterior:
            self.n_mel_channels = getattr(hparams, "n_mel_channels", 80)

        self.cleaned_text = getattr(hparams, "cleaned_text", False)

        self.add_blank = hparams.add_blank
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 300)

        # Fixed seed so the shuffled order is reproducible across runs.
        random.seed(1234)
        random.shuffle(self.audiopaths_sid_text)
        self._filter()

    def _filter(self):
        """Drop rows whose text length is out of range and store spec lengths.

        The stored lengths are used for bucketing:
            wav_length ~= file_size / (wav_channels * bytes per dim)
                        = file_size / (1 * 2)
            spec_length = wav_length // hop_length
        """
        kept_rows = []
        lengths = []
        skipped = 0
        logger.info("Init dataset...")
        for item in tqdm(self.audiopaths_sid_text):
            try:
                _id, spk, language, text, phones, tone, word2ph = item
            except (TypeError, ValueError):
                # Show the malformed row before propagating the error.
                print(item)
                raise
            audiopath = f"{_id}"
            # NOTE: ``phones`` is still the raw string here, so the limits
            # apply to its character count, not to the number of phones.
            if self.min_text_len <= len(phones) <= self.max_text_len:
                phones = phones.split(" ")
                tone = [int(i) for i in tone.split(" ")]
                word2ph = [int(i) for i in word2ph.split(" ")]
                kept_rows.append(
                    [audiopath, spk, language, text, phones, tone, word2ph]
                )
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
            else:
                skipped += 1
        if lengths:
            logger.info(f'min: {min(lengths)}; max: {max(lengths)}')
        else:
            # min()/max() raise on an empty list; report the situation instead.
            logger.warning("No samples left after filtering.")
        logger.info(
            "skipped: "
            + str(skipped)
            + ", total: "
            + str(len(self.audiopaths_sid_text))
        )
        self.audiopaths_sid_text = kept_rows
        self.lengths = lengths

    def get_audio_text_speaker_pair(self, audiopath_sid_text):
        """Turn one filtered metadata row into the tuple of model inputs.

        Returns:
            ``(phones, spec, wav, sid, tone, language, bert, ja_bert)``.
        """
        # separate filename, speaker_id and text
        audiopath, sid, language, text, phones, tone, word2ph = audiopath_sid_text

        bert, ja_bert, phones, tone, language = self.get_text(
            text, word2ph, phones, tone, language, audiopath
        )

        spec, wav = self.get_audio(audiopath)
        # Speakers missing from the speaker map fall back to id 0.
        sid = int(getattr(self.spk_map, sid, '0'))
        sid = torch.LongTensor([sid])
        return (phones, spec, wav, sid, tone, language, bert, ja_bert)

    def get_audio(self, filename):
        """Load ``filename`` and compute its spectrogram.

        The spectrogram is recomputed on every call and written next to the
        wav file (``.spec.pt`` or ``.mel.pt``). The previous implementation
        loaded that file and then discarded it through ``assert False``
        inside a bare ``except``; the net effect in a normal interpreter was
        "always recompute", which is now stated explicitly (and no longer
        changes meaning under ``python -O``).

        Returns:
            ``(spec, audio_norm)`` where ``audio_norm`` has a leading axis
            added by ``unsqueeze(0)``.

        Raises:
            ValueError: if the loader reports a sampling rate different from
                ``self.sampling_rate``.
        """
        audio_norm, sampling_rate = load_wav_to_torch(filename, self.sampling_rate)
        if sampling_rate != self.sampling_rate:
            raise ValueError(
                "{} {} SR doesn't match target {} SR".format(
                    filename, sampling_rate, self.sampling_rate
                )
            )
        # NOTE(review): no division by max_wav_value here; the loader is
        # presumed to return already-normalized audio -- confirm.
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if self.use_mel_spec_posterior:
            spec_filename = spec_filename.replace(".spec.pt", ".mel.pt")

        if self.use_mel_spec_posterior:
            spec = mel_spectrogram_torch(
                audio_norm,
                self.filter_length,
                self.n_mel_channels,
                self.sampling_rate,
                self.hop_length,
                self.win_length,
                self.hparams.mel_fmin,
                self.hparams.mel_fmax,
                center=False,
            )
        else:
            spec = spectrogram_torch(
                audio_norm,
                self.filter_length,
                self.sampling_rate,
                self.hop_length,
                self.win_length,
                center=False,
            )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_filename)
        return spec, audio_norm

    def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
        """Build the integer sequences and BERT features for one sample.

        Cached BERT features are read from ``<wav>.bert.pt``; when the file
        cannot be loaded or its last dimension does not match the phone
        sequence, the features are regenerated with ``get_bert`` and saved.

        Returns:
            ``(bert, ja_bert, phone, tone, language)``.

        Raises:
            RuntimeError: if ``language_str`` is not a supported language
                (only checked when BERT features are enabled).
        """
        phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
        if self.add_blank:
            phone = commons.intersperse(phone, 0)
            tone = commons.intersperse(tone, 0)
            language = commons.intersperse(language, 0)
            # Work on a copy: ``word2ph`` is the list stored in the dataset
            # row, and doubling it in place would compound on every fetch.
            word2ph = [count * 2 for count in word2ph]
            word2ph[0] += 1
        bert_path = wav_path.replace(".wav", ".bert.pt")
        cached_shape = None  # stays None when the cache file cannot be loaded
        try:
            bert = torch.load(bert_path)
            cached_shape = bert.shape
            if cached_shape[-1] != len(phone):
                raise ValueError("cached BERT features do not match phone length")
        except Exception as e:
            print(e, wav_path, bert_path, cached_shape, len(phone))
            bert = get_bert(text, word2ph, language_str)
            torch.save(bert, bert_path)
            assert bert.shape[-1] == len(phone), phone

        if self.disable_bert:
            bert = torch.zeros(1024, len(phone))
            ja_bert = torch.zeros(768, len(phone))
        else:
            if language_str in ["ZH"]:
                # Features go to the 1024-wide slot; the 768-wide one is zeros.
                ja_bert = torch.zeros(768, len(phone))
            elif language_str in ["JP", "EN", "ZH_MIX_EN", "KR", 'SP', 'ES', 'FR', 'DE', 'RU']:
                # Features go to the 768-wide slot; the 1024-wide one is zeros.
                ja_bert = bert
                bert = torch.zeros(1024, len(phone))
            else:
                # Previously a bare ``raise`` (which itself surfaced as a
                # RuntimeError); the exception type is kept, with a message.
                raise RuntimeError(f"Unsupported language: {language_str!r}")
        assert bert.shape[-1] == len(phone)
        phone = torch.LongTensor(phone)
        tone = torch.LongTensor(tone)
        language = torch.LongTensor(language)
        return bert, ja_bert, phone, tone, language

    def get_sid(self, sid):
        """Return ``sid`` as a one-element ``LongTensor``."""
        sid = torch.LongTensor([int(sid)])
        return sid

    def __getitem__(self, index):
        return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])

    def __len__(self):
        return len(self.audiopaths_sid_text)
198
+
199
+
200
class TextAudioSpeakerCollate:
    """Zero-pads a list of samples into dense batch tensors."""

    def __init__(self, return_ids=False):
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate samples, ordered by decreasing spectrogram length.

        PARAMS
        ------
        batch: list of samples indexed as
            [text, spec, wav, sid, tone, language, bert, ja_bert]

        RETURNS
        -------
        (text_padded, text_lengths, spec_padded, spec_lengths, wav_padded,
         wav_lengths, sid, tone_padded, language_padded, bert_padded,
         ja_bert_padded)
        """
        count = len(batch)

        # Longest spectrogram first.
        spec_frames = torch.LongTensor([sample[1].size(1) for sample in batch])
        order = torch.sort(spec_frames, dim=0, descending=True)[1]

        longest_text = max(len(sample[0]) for sample in batch)
        longest_spec = max(sample[1].size(1) for sample in batch)
        longest_wav = max(sample[2].size(1) for sample in batch)

        text_lengths = torch.zeros(count, dtype=torch.long)
        spec_lengths = torch.zeros(count, dtype=torch.long)
        wav_lengths = torch.zeros(count, dtype=torch.long)
        sid = torch.zeros(count, dtype=torch.long)

        text_padded = torch.zeros(count, longest_text, dtype=torch.long)
        tone_padded = torch.zeros(count, longest_text, dtype=torch.long)
        language_padded = torch.zeros(count, longest_text, dtype=torch.long)
        bert_padded = torch.zeros(count, 1024, longest_text, dtype=torch.float32)
        ja_bert_padded = torch.zeros(count, 768, longest_text, dtype=torch.float32)

        spec_padded = torch.zeros(
            count, batch[0][1].size(0), longest_spec, dtype=torch.float32
        )
        wav_padded = torch.zeros(count, 1, longest_wav, dtype=torch.float32)

        for slot, source in enumerate(order.tolist()):
            sample = batch[source]

            tokens = sample[0]
            text_lengths[slot] = tokens.size(0)
            text_padded[slot, : tokens.size(0)] = tokens

            frames = sample[1]
            spec_lengths[slot] = frames.size(1)
            spec_padded[slot, :, : frames.size(1)] = frames

            waveform = sample[2]
            wav_lengths[slot] = waveform.size(1)
            wav_padded[slot, :, : waveform.size(1)] = waveform

            sid[slot] = sample[3]

            tones = sample[4]
            tone_padded[slot, : tones.size(0)] = tones

            langs = sample[5]
            language_padded[slot, : langs.size(0)] = langs

            bert_feats = sample[6]
            bert_padded[slot, :, : bert_feats.size(1)] = bert_feats

            ja_bert_feats = sample[7]
            ja_bert_padded[slot, :, : ja_bert_feats.size(1)] = ja_bert_feats

        return (
            text_padded,
            text_lengths,
            spec_padded,
            spec_lengths,
            wav_padded,
            wav_lengths,
            sid,
            tone_padded,
            language_padded,
            bert_padded,
            ja_bert_padded,
        )
283
+
284
+
285
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
    Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.

    It removes samples which are not included in the boundaries.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
    """

    def __init__(
        self,
        dataset,
        batch_size,
        boundaries,
        num_replicas=None,
        rank=None,
        shuffle=True,
    ):
        """
        Args:
            dataset: dataset exposing a ``lengths`` sequence (one entry per
                sample) in addition to ``__len__``.
            batch_size: number of samples per yielded batch.
            boundaries: increasing length boundaries delimiting the buckets.
                The list is copied, so the caller's object is never modified.
            num_replicas, rank, shuffle: forwarded to ``DistributedSampler``.
        """
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        # Copy: empty buckets are pruned below by popping boundaries, which
        # previously mutated the list object owned by the caller.
        self.boundaries = list(boundaries)

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas
        print('buckets:', self.num_samples_per_bucket)

    def _create_buckets(self):
        """Assign sample indices to buckets and drop the empty buckets.

        Returns:
            ``(buckets, num_samples_per_bucket)`` where each per-bucket count
            is padded up to a multiple of ``num_replicas * batch_size``.
        """
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for sample_idx, length in enumerate(self.lengths):
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(sample_idx)

        if buckets and not buckets[0]:
            print("Bucket warning ", "first bucket is empty")
        # Single backwards pass (so pops do not shift pending indices).
        # Removing the upper boundary of an empty bucket merges its range
        # into the next bucket. This no longer relies on an ``assert`` plus
        # ``except`` to reach bucket 0, which was skipped under ``python -O``.
        for i in range(len(buckets) - 1, -1, -1):
            if not buckets[i]:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        total_batch_size = self.num_replicas * self.batch_size
        num_samples_per_bucket = []
        for bucket in buckets:
            len_bucket = len(bucket)
            rem = (
                total_batch_size - (len_bucket % total_batch_size)
            ) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket

    def __iter__(self):
        """Yield batches (lists of dataset indices) for this rank."""
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        if self.shuffle:
            indices = [
                torch.randperm(len(bucket), generator=g).tolist()
                for bucket in self.buckets
            ]
        else:
            indices = [list(range(len(bucket))) for bucket in self.buckets]

        batches = []
        for i, bucket in enumerate(self.buckets):
            len_bucket = len(bucket)
            if len_bucket == 0:
                continue
            ids_bucket = indices[i]
            num_samples_bucket = self.num_samples_per_bucket[i]

            # add extra samples to make it evenly divisible
            rem = num_samples_bucket - len_bucket
            ids_bucket = (
                ids_bucket
                + ids_bucket * (rem // len_bucket)
                + ids_bucket[: (rem % len_bucket)]
            )

            # subsample
            ids_bucket = ids_bucket[self.rank :: self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [
                    bucket[idx]
                    for idx in ids_bucket[
                        j * self.batch_size : (j + 1) * self.batch_size
                    ]
                ]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """Return the index ``i`` with ``boundaries[i] < x <= boundaries[i + 1]``, or -1."""
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1

    def __len__(self):
        """Number of batches yielded per epoch on this rank."""
        return self.num_samples // self.batch_size
@@ -0,0 +1,67 @@
1
+ import torch
2
+ import os
3
+ from . import utils
4
+ from cached_path import cached_path
5
+ from huggingface_hub import hf_hub_download
6
+
7
# Common prefix of every base-speaker asset URL listed below.
_BASE_SPEAKER_URL = 'https://myshell-public-repo-host.s3.amazonaws.com/openvoice/basespeakers'

# Language codes that have a checkpoint/config pair under the prefix above.
_URL_LANGUAGES = ('EN', 'EN_V2', 'FR', 'JP', 'ES', 'ZH', 'KR')

# language code -> checkpoint URL
DOWNLOAD_CKPT_URLS = {
    code: f'{_BASE_SPEAKER_URL}/{code}/checkpoint.pth' for code in _URL_LANGUAGES
}

# language code -> config URL
DOWNLOAD_CONFIG_URLS = {
    code: f'{_BASE_SPEAKER_URL}/{code}/config.json' for code in _URL_LANGUAGES
}

# pretrained file name -> URL
PRETRAINED_MODELS = {
    name: f'{_BASE_SPEAKER_URL}/pretrained/{name}'
    for name in ('G.pth', 'D.pth', 'DUR.pth')
}

# language code -> Hugging Face repository id
LANG_TO_HF_REPO_ID = {
    code: f'myshell-ai/MeloTTS-{suffix}'
    for code, suffix in (
        ('EN', 'English'),
        ('EN_V2', 'English-v2'),
        ('EN_NEWEST', 'English-v3'),
        ('FR', 'French'),
        ('JP', 'Japanese'),
        ('ES', 'Spanish'),
        ('ZH', 'Chinese'),
        ('KR', 'Korean'),
    )
}
43
+
44
def load_or_download_config(locale, use_hf=True, config_path=None):
    """Return the hyper-parameters parsed from a model config file.

    Args:
        locale: language tag; the part before the first ``-`` is upper-cased
            and used as the lookup key. Ignored when ``config_path`` is given.
        use_hf: look the config up via ``LANG_TO_HF_REPO_ID`` and
            ``hf_hub_download`` when true, otherwise via
            ``DOWNLOAD_CONFIG_URLS`` and ``cached_path``.
        config_path: explicit config file; skips any download.

    Returns:
        Whatever ``utils.get_hparams_from_file`` returns for the config.
    """
    if config_path is not None:
        return utils.get_hparams_from_file(config_path)

    lang_key = locale.split('-')[0].upper()
    if use_hf:
        assert lang_key in LANG_TO_HF_REPO_ID
        resolved = hf_hub_download(
            repo_id=LANG_TO_HF_REPO_ID[lang_key], filename="config.json"
        )
    else:
        assert lang_key in DOWNLOAD_CONFIG_URLS
        resolved = cached_path(DOWNLOAD_CONFIG_URLS[lang_key])
    return utils.get_hparams_from_file(resolved)
54
+
55
def load_or_download_model(locale, device, use_hf=True, ckpt_path=None):
    """Load a checkpoint onto ``device``, downloading it first if needed.

    Args:
        locale: language tag; the part before the first ``-`` is upper-cased
            and used as the lookup key. Ignored when ``ckpt_path`` is given.
        device: passed to ``torch.load`` as ``map_location``.
        use_hf: look the checkpoint up via ``LANG_TO_HF_REPO_ID`` and
            ``hf_hub_download`` when true, otherwise via
            ``DOWNLOAD_CKPT_URLS`` and ``cached_path``.
        ckpt_path: explicit checkpoint file; skips any download.

    Returns:
        The object deserialized by ``torch.load``.
    """
    # NOTE(review): torch.load unpickles the file -- only use checkpoints
    # from trusted sources.
    if ckpt_path is not None:
        return torch.load(ckpt_path, map_location=device)

    lang_key = locale.split('-')[0].upper()
    if use_hf:
        assert lang_key in LANG_TO_HF_REPO_ID
        resolved = hf_hub_download(
            repo_id=LANG_TO_HF_REPO_ID[lang_key], filename="checkpoint.pth"
        )
    else:
        assert lang_key in DOWNLOAD_CKPT_URLS
        resolved = cached_path(DOWNLOAD_CKPT_URLS[lang_key])
    return torch.load(resolved, map_location=device)
65
+
66
def load_pretrain_model():
    """Return ``cached_path(url)`` for each ``PRETRAINED_MODELS`` URL, in order."""
    resolved = []
    for url in PRETRAINED_MODELS.values():
        resolved.append(cached_path(url))
    return resolved
@@ -0,0 +1,25 @@
1
+ import os
2
+ import click
3
+ from melo.api import TTS
4
+
5
+
6
+
7
@click.command()
@click.option('--ckpt_path', '-m', type=str, default=None, help="Path to the checkpoint file")
@click.option('--text', '-t', type=str, default=None, help="Text to speak")
@click.option('--language', '-l', type=str, default="EN", help="Language of the model")
@click.option('--output_dir', '-o', type=str, default="outputs", help="Path to the output")
def main(ckpt_path, text, language, output_dir):
    """Synthesize ``text`` once per speaker of the checkpoint.

    The config is read from ``config.json`` in the checkpoint's directory,
    and each speaker's audio is written to
    ``<output_dir>/<speaker name>/output.wav``.

    Raises:
        ValueError: if ``--ckpt_path`` or ``--text`` is missing.
    """
    if ckpt_path is None:
        # The message used to mention "model_path", which is not an option.
        raise ValueError("The ckpt_path must be specified")
    if text is None:
        # Fail early instead of handing None to the synthesizer.
        raise ValueError("The text must be specified")

    config_path = os.path.join(os.path.dirname(ckpt_path), 'config.json')
    model = TTS(language=language, config_path=config_path, ckpt_path=ckpt_path)

    for spk_name, spk_id in model.hps.data.spk2id.items():
        save_path = os.path.join(output_dir, spk_name, 'output.wav')
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        model.tts_to_file(text, spk_id, save_path)


if __name__ == "__main__":
    main()
@@ -0,0 +1,14 @@
1
+
2
+
3
if __name__ == '__main__':

    from melo.api import TTS

    # One TTS instance per language, all built with automatic device choice.
    # NOTE(review): presumably run only to pre-fetch the model files -- confirm.
    device = 'auto'
    models = {
        code: TTS(language=code, device=device)
        for code in ('EN', 'ES', 'FR', 'ZH', 'JP', 'KR')
    }
@@ -0,0 +1,58 @@
1
+ import torch
2
+
3
+
4
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference between feature maps.

    ``fmap_r`` and ``fmap_g`` are iterables of iterables of tensors, paired
    element-wise. The ``fmap_r`` tensors are detached, so gradients only flow
    through ``fmap_g``. Returns the int ``0`` when there is nothing to pair.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real_feat, gen_feat in zip(real_maps, gen_maps):
            difference = real_feat.float().detach() - gen_feat.float()
            total = total + difference.abs().mean()
    return total * 2
13
+
14
+
15
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss over paired outputs.

    For each pair the real term is ``mean((1 - real) ** 2)`` and the
    generated term is ``mean(generated ** 2)``.

    Returns:
        ``(loss, r_losses, g_losses)``: the summed loss, plus the per-pair
        real and generated terms as Python floats.
    """
    total = 0
    real_terms, fake_terms = [], []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = ((1 - real_out.float()) ** 2).mean()
        fake_term = (fake_out.float() ** 2).mean()
        total = total + (real_term + fake_term)
        real_terms.append(real_term.item())
        fake_terms.append(fake_term.item())
    return total, real_terms, fake_terms
29
+
30
+
31
def generator_loss(disc_outputs):
    """Least-squares generator loss: sum of ``mean((1 - output) ** 2)``.

    Returns:
        ``(loss, gen_losses)``: the summed loss and the list of per-output
        terms (tensors).
    """
    total = 0
    per_output = []
    for output in disc_outputs:
        term = ((1 - output.float()) ** 2).mean()
        per_output.append(term)
        total = total + term
    return total, per_output
41
+
42
+
43
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """Masked KL term averaged over the mask.

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    All inputs are cast to float first. The result is the mask-weighted sum
    of the element-wise term divided by the sum of ``z_mask``.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (
        tensor.float() for tensor in (z_p, logs_q, m_p, logs_p, z_mask)
    )

    divergence = logs_p - logs_q - 0.5
    # In-place accumulation, as before.
    divergence.add_(0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p))
    masked_total = torch.sum(divergence * z_mask)
    return masked_total / torch.sum(z_mask)
@@ -0,0 +1,36 @@
1
+ import click
2
+ import warnings
3
+ import os
4
+
5
+
6
@click.command()
@click.argument('text')
@click.argument('output_path')
@click.option("--file", '-f', is_flag=True, show_default=True, default=False, help="Text is a file")
@click.option('--language', '-l', default='EN', help='Language, defaults to English', type=click.Choice(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], case_sensitive=False))
@click.option('--speaker', '-spk', default=None, help='Speaker ID, only for English, leave empty for default, ignored if not English. If English, defaults to "EN-Default"', type=click.Choice(['EN-Default', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU']))
@click.option('--speed', '-s', default=1.0, help='Speed, defaults to 1.0', type=float)
@click.option('--device', '-d', default='auto', help='Device, defaults to auto')
def main(text, file, output_path, language, speaker, speed, device):
    """Synthesize TEXT (or the contents of the file TEXT) into OUTPUT_PATH.

    Raises:
        FileNotFoundError: ``--file`` was given but TEXT is not an existing path.
        ValueError: the text to speak is empty.
    """
    if file:
        if not os.path.exists(text):
            raise FileNotFoundError('Trying to load text from file due to --file/-f flag, but file not found. Remove the --file/-f flag to pass a string.')
        with open(text) as f:
            text = f.read().strip()
    if text == '':
        raise ValueError('You entered empty text or the file you passed was empty.')
    language = language.upper()
    if language == '':
        language = 'EN'
    if speaker == '':
        speaker = None
    # ``--speaker`` now defaults to None, so this warning only fires when the
    # user really passed a speaker (the old 'EN-Default' default triggered it
    # on every non-English run). The message used to say "is English".
    if language != 'EN' and speaker:
        warnings.warn('You specified a speaker but the language is not English; the speaker is ignored.')
    # Imported late so argument errors surface before the heavy import.
    from melo.api import TTS
    model = TTS(language=language, device=device)
    speaker_ids = model.hps.data.spk2id
    if language == 'EN':
        if not speaker:
            speaker = 'EN-Default'
        spkr = speaker_ids[speaker]
    else:
        # Other languages use the first speaker listed in the model.
        spkr = speaker_ids[list(speaker_ids.keys())[0]]
    model.tts_to_file(text, spkr, output_path, speed=speed)