xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of xinference has been flagged as potentially problematic.

Files changed (343)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +77 -71
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +79 -19
  6. xinference/core/supervisor.py +172 -10
  7. xinference/core/utils.py +12 -8
  8. xinference/core/worker.py +102 -4
  9. xinference/deploy/cmdline.py +3 -1
  10. xinference/deploy/test/test_cmdline.py +56 -0
  11. xinference/isolation.py +24 -0
  12. xinference/model/audio/core.py +16 -0
  13. xinference/model/audio/cosyvoice.py +39 -6
  14. xinference/model/audio/f5tts.py +200 -0
  15. xinference/model/audio/f5tts_mlx.py +260 -0
  16. xinference/model/audio/fish_speech.py +36 -111
  17. xinference/model/audio/melotts.py +110 -0
  18. xinference/model/audio/model_spec.json +99 -3
  19. xinference/model/audio/model_spec_modelscope.json +27 -0
  20. xinference/model/audio/utils.py +32 -0
  21. xinference/model/audio/whisper.py +35 -10
  22. xinference/model/embedding/core.py +203 -142
  23. xinference/model/embedding/model_spec.json +7 -0
  24. xinference/model/embedding/model_spec_modelscope.json +8 -0
  25. xinference/model/image/core.py +69 -1
  26. xinference/model/image/model_spec.json +145 -4
  27. xinference/model/image/model_spec_modelscope.json +150 -4
  28. xinference/model/image/stable_diffusion/core.py +45 -13
  29. xinference/model/llm/__init__.py +4 -2
  30. xinference/model/llm/llm_family.json +536 -53
  31. xinference/model/llm/llm_family.py +15 -36
  32. xinference/model/llm/llm_family_modelscope.json +454 -20
  33. xinference/model/llm/memory.py +1 -1
  34. xinference/model/llm/mlx/core.py +248 -52
  35. xinference/model/llm/sglang/core.py +1 -0
  36. xinference/model/llm/transformers/chatglm.py +9 -5
  37. xinference/model/llm/transformers/cogagent.py +272 -0
  38. xinference/model/llm/transformers/core.py +2 -0
  39. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  40. xinference/model/llm/transformers/utils.py +16 -8
  41. xinference/model/llm/utils.py +36 -4
  42. xinference/model/llm/vllm/core.py +53 -10
  43. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  44. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  45. xinference/model/llm/vllm/xavier/block.py +111 -0
  46. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  47. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  48. xinference/model/llm/vllm/xavier/collective.py +74 -0
  49. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  50. xinference/model/llm/vllm/xavier/engine.py +247 -0
  51. xinference/model/llm/vllm/xavier/executor.py +134 -0
  52. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  53. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  54. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  55. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  56. xinference/model/video/diffusers.py +14 -0
  57. xinference/model/video/model_spec.json +15 -0
  58. xinference/model/video/model_spec_modelscope.json +16 -0
  59. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  60. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  61. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  62. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  63. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  64. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  65. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  66. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  67. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  68. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  69. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  70. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  71. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  72. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  73. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  74. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  75. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  76. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  77. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  78. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  79. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  80. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  81. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  82. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  83. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  84. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  85. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  86. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  87. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  88. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  89. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  90. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  91. xinference/thirdparty/f5_tts/api.py +166 -0
  92. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  93. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  94. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  95. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  96. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  97. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  98. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  99. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  100. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  101. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  102. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  103. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  104. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  105. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  106. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  107. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  108. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  109. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  110. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  111. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  112. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  114. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  115. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  116. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  117. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  118. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  119. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  120. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  121. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  122. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  123. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  124. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  125. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  126. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  127. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  128. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  129. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  130. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  131. xinference/thirdparty/f5_tts/train/README.md +77 -0
  132. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  133. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  134. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  135. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  136. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  137. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  138. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  139. xinference/thirdparty/f5_tts/train/train.py +75 -0
  140. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  141. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  142. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  143. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  144. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  145. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  146. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  147. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  148. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  149. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  150. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  151. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  152. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  153. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  154. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  155. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  156. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  157. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  158. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  159. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  160. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  161. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  162. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  163. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  164. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  165. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  166. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  167. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  168. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  169. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  170. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  171. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  172. xinference/thirdparty/matcha/utils/utils.py +2 -2
  173. xinference/thirdparty/melo/api.py +135 -0
  174. xinference/thirdparty/melo/app.py +61 -0
  175. xinference/thirdparty/melo/attentions.py +459 -0
  176. xinference/thirdparty/melo/commons.py +160 -0
  177. xinference/thirdparty/melo/configs/config.json +94 -0
  178. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  179. xinference/thirdparty/melo/data_utils.py +413 -0
  180. xinference/thirdparty/melo/download_utils.py +67 -0
  181. xinference/thirdparty/melo/infer.py +25 -0
  182. xinference/thirdparty/melo/init_downloads.py +14 -0
  183. xinference/thirdparty/melo/losses.py +58 -0
  184. xinference/thirdparty/melo/main.py +36 -0
  185. xinference/thirdparty/melo/mel_processing.py +174 -0
  186. xinference/thirdparty/melo/models.py +1030 -0
  187. xinference/thirdparty/melo/modules.py +598 -0
  188. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  189. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  190. xinference/thirdparty/melo/preprocess_text.py +135 -0
  191. xinference/thirdparty/melo/split_utils.py +174 -0
  192. xinference/thirdparty/melo/text/__init__.py +35 -0
  193. xinference/thirdparty/melo/text/chinese.py +199 -0
  194. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  195. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  196. xinference/thirdparty/melo/text/cleaner.py +36 -0
  197. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  198. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  199. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  200. xinference/thirdparty/melo/text/english.py +284 -0
  201. xinference/thirdparty/melo/text/english_bert.py +39 -0
  202. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  203. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  204. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  205. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  206. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  207. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  208. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  209. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  210. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  211. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  212. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  213. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  214. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  215. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  216. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  217. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  218. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  219. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  220. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  221. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  222. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  223. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  224. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  225. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  226. xinference/thirdparty/melo/text/french.py +94 -0
  227. xinference/thirdparty/melo/text/french_bert.py +39 -0
  228. xinference/thirdparty/melo/text/japanese.py +647 -0
  229. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  230. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  231. xinference/thirdparty/melo/text/korean.py +192 -0
  232. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  233. xinference/thirdparty/melo/text/spanish.py +122 -0
  234. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  235. xinference/thirdparty/melo/text/symbols.py +290 -0
  236. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  237. xinference/thirdparty/melo/train.py +635 -0
  238. xinference/thirdparty/melo/train.sh +19 -0
  239. xinference/thirdparty/melo/transforms.py +209 -0
  240. xinference/thirdparty/melo/utils.py +424 -0
  241. xinference/types.py +15 -0
  242. xinference/web/ui/build/asset-manifest.json +6 -6
  243. xinference/web/ui/build/index.html +1 -1
  244. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  245. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  246. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  247. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  248. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  249. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  250. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  251. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  252. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  253. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  254. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  255. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  256. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  257. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  258. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  259. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  260. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  261. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  262. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  263. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  264. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  265. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  266. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  292. xinference/web/ui/node_modules/.package-lock.json +67 -3
  293. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  294. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  295. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  296. xinference/web/ui/node_modules/i18next/package.json +129 -0
  297. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  298. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  299. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  300. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  301. xinference/web/ui/package-lock.json +69 -3
  302. xinference/web/ui/package.json +2 -0
  303. xinference/web/ui/src/locales/en.json +186 -0
  304. xinference/web/ui/src/locales/zh.json +186 -0
  305. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/METADATA +68 -32
  306. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/RECORD +316 -122
  307. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  308. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  309. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  310. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  311. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  312. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  313. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  314. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  315. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  316. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  317. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  318. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  319. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  320. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  321. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  322. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  323. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  324. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  325. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  326. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  327. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  328. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  329. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  330. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  331. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  332. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  333. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  334. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  335. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  336. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  337. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  338. /xinference/thirdparty/{fish_speech/tools → melo/text/fr_phonemizer}/__init__.py +0 -0
  339. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  340. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  341. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/WHEEL +0 -0
  342. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  343. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/f5_tts/eval/utils_eval.py
@@ -0,0 +1,405 @@
+ import math
+ import os
+ import random
+ import string
+
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ from tqdm import tqdm
+
+ from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
+ from f5_tts.model.modules import MelSpec
+ from f5_tts.model.utils import convert_char_to_pinyin
+
+
+ # seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
+ def get_seedtts_testset_metainfo(metalst):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+     metainfo = []
+     for line in lines:
+         if len(line.strip().split("|")) == 5:
+             utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
+         elif len(line.strip().split("|")) == 4:
+             utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
+             gt_wav = os.path.join(os.path.dirname(metalst), "wavs", utt + ".wav")
+         if not os.path.isabs(prompt_wav):
+             prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
+         metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
+     return metainfo
+
+
+ # librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
+ def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+     metainfo = []
+     for line in lines:
+         ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")
+
+         # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
+         ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
+         ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")
+
+         # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
+         gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
+         gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
+
+         metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))
+
+     return metainfo
+
+
+ # padded to max length mel batch
+ def padded_mel_batch(ref_mels):
+     max_mel_length = torch.LongTensor([mel.shape[-1] for mel in ref_mels]).amax()
+     padded_ref_mels = []
+     for mel in ref_mels:
+         padded_ref_mel = F.pad(mel, (0, max_mel_length - mel.shape[-1]), value=0)
+         padded_ref_mels.append(padded_ref_mel)
+     padded_ref_mels = torch.stack(padded_ref_mels)
+     padded_ref_mels = padded_ref_mels.permute(0, 2, 1)
+     return padded_ref_mels
+
+
+ # get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav
+
+
+ def get_inference_prompt(
+     metainfo,
+     speed=1.0,
+     tokenizer="pinyin",
+     polyphone=True,
+     target_sample_rate=24000,
+     n_fft=1024,
+     win_length=1024,
+     n_mel_channels=100,
+     hop_length=256,
+     mel_spec_type="vocos",
+     target_rms=0.1,
+     use_truth_duration=False,
+     infer_batch_size=1,
+     num_buckets=200,
+     min_secs=3,
+     max_secs=40,
+ ):
+     prompts_all = []
+
+     min_tokens = min_secs * target_sample_rate // hop_length
+     max_tokens = max_secs * target_sample_rate // hop_length
+
+     batch_accum = [0] * num_buckets
+     utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = (
+         [[] for _ in range(num_buckets)] for _ in range(6)
+     )
+
+     mel_spectrogram = MelSpec(
+         n_fft=n_fft,
+         hop_length=hop_length,
+         win_length=win_length,
+         n_mel_channels=n_mel_channels,
+         target_sample_rate=target_sample_rate,
+         mel_spec_type=mel_spec_type,
+     )
+
+     for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
+         # Audio
+         ref_audio, ref_sr = torchaudio.load(prompt_wav)
+         ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
+         if ref_rms < target_rms:
+             ref_audio = ref_audio * target_rms / ref_rms
+         assert ref_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
+         if ref_sr != target_sample_rate:
+             resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
+             ref_audio = resampler(ref_audio)
+
+         # Text
+         if len(prompt_text[-1].encode("utf-8")) == 1:
+             prompt_text = prompt_text + " "
+         text = [prompt_text + gt_text]
+         if tokenizer == "pinyin":
+             text_list = convert_char_to_pinyin(text, polyphone=polyphone)
+         else:
+             text_list = text
+
+         # Duration, mel frame length
+         ref_mel_len = ref_audio.shape[-1] // hop_length
+         if use_truth_duration:
+             gt_audio, gt_sr = torchaudio.load(gt_wav)
+             if gt_sr != target_sample_rate:
+                 resampler = torchaudio.transforms.Resample(gt_sr, target_sample_rate)
+                 gt_audio = resampler(gt_audio)
+             total_mel_len = ref_mel_len + int(gt_audio.shape[-1] / hop_length / speed)
+
+             # # test vocoder resynthesis
+             # ref_audio = gt_audio
+         else:
+             ref_text_len = len(prompt_text.encode("utf-8"))
+             gen_text_len = len(gt_text.encode("utf-8"))
+             total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
+
+         # to mel spectrogram
+         ref_mel = mel_spectrogram(ref_audio)
+         ref_mel = ref_mel.squeeze(0)
+
+         # deal with batch
+         assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
+         assert (
+             min_tokens <= total_mel_len <= max_tokens
+         ), f"Audio {utt} has duration {total_mel_len*hop_length//target_sample_rate}s out of range [{min_secs}, {max_secs}]."
+         bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)
+
+         utts[bucket_i].append(utt)
+         ref_rms_list[bucket_i].append(ref_rms)
+         ref_mels[bucket_i].append(ref_mel)
+         ref_mel_lens[bucket_i].append(ref_mel_len)
+         total_mel_lens[bucket_i].append(total_mel_len)
+         final_text_list[bucket_i].extend(text_list)
+
+         batch_accum[bucket_i] += total_mel_len
+
+         if batch_accum[bucket_i] >= infer_batch_size:
+             # print(f"\n{len(ref_mels[bucket_i][0][0])}\n{ref_mel_lens[bucket_i]}\n{total_mel_lens[bucket_i]}")
+             prompts_all.append(
+                 (
+                     utts[bucket_i],
+                     ref_rms_list[bucket_i],
+                     padded_mel_batch(ref_mels[bucket_i]),
+                     ref_mel_lens[bucket_i],
+                     total_mel_lens[bucket_i],
+                     final_text_list[bucket_i],
+                 )
+             )
+             batch_accum[bucket_i] = 0
+             (
+                 utts[bucket_i],
+                 ref_rms_list[bucket_i],
+                 ref_mels[bucket_i],
+                 ref_mel_lens[bucket_i],
+                 total_mel_lens[bucket_i],
+                 final_text_list[bucket_i],
+             ) = [], [], [], [], [], []
+
+     # add residual
+     for bucket_i, bucket_frames in enumerate(batch_accum):
+         if bucket_frames > 0:
+             prompts_all.append(
+                 (
+                     utts[bucket_i],
+                     ref_rms_list[bucket_i],
+                     padded_mel_batch(ref_mels[bucket_i]),
+                     ref_mel_lens[bucket_i],
+                     total_mel_lens[bucket_i],
+                     final_text_list[bucket_i],
+                 )
+             )
+     # not only leave easy work for last workers
+     random.seed(666)
+     random.shuffle(prompts_all)
+
+     return prompts_all
+
+
+ # get wav_res_ref_text of seed-tts test metalst
+ # https://github.com/BytedanceSpeech/seed-tts-eval
+
+
+ def get_seed_tts_test(metalst, gen_wav_dir, gpus):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+
+     test_set_ = []
+     for line in tqdm(lines):
+         if len(line.strip().split("|")) == 5:
+             utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
+         elif len(line.strip().split("|")) == 4:
+             utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
+
+         if not os.path.exists(os.path.join(gen_wav_dir, utt + ".wav")):
+             continue
+         gen_wav = os.path.join(gen_wav_dir, utt + ".wav")
+         if not os.path.isabs(prompt_wav):
+             prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
+
+         test_set_.append((gen_wav, prompt_wav, gt_text))
+
+     num_jobs = len(gpus)
+     if num_jobs == 1:
+         return [(gpus[0], test_set_)]
+
+     wav_per_job = len(test_set_) // num_jobs + 1
+     test_set = []
+     for i in range(num_jobs):
+         test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
+
+     return test_set
+
+
+ # get librispeech test-clean cross sentence test
+
+
+ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+
+     test_set_ = []
+     for line in tqdm(lines):
+         ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")
+
+         if eval_ground_truth:
+             gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
+             gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
+         else:
+             if not os.path.exists(os.path.join(gen_wav_dir, gen_utt + ".wav")):
+                 raise FileNotFoundError(f"Generated wav not found: {gen_utt}")
+             gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")
+
+         ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
+         ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")
+
+         test_set_.append((gen_wav, ref_wav, gen_txt))
+
+     num_jobs = len(gpus)
+     if num_jobs == 1:
+         return [(gpus[0], test_set_)]
+
+     wav_per_job = len(test_set_) // num_jobs + 1
+     test_set = []
+     for i in range(num_jobs):
+         test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
+
+     return test_set
+
+
+ # load asr model
+
+
+ def load_asr_model(lang, ckpt_dir=""):
+     if lang == "zh":
+         from funasr import AutoModel
+
+         model = AutoModel(
+             model=os.path.join(ckpt_dir, "paraformer-zh"),
+             # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
+             # punc_model = os.path.join(ckpt_dir, "ct-punc"),
+             # spk_model = os.path.join(ckpt_dir, "cam++"),
+             disable_update=True,
+         )  # following seed-tts setting
+     elif lang == "en":
+         from faster_whisper import WhisperModel
+
+         model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
+         model = WhisperModel(model_size, device="cuda", compute_type="float16")
+     return model
+
+
+ # WER Evaluation, the way Seed-TTS does
+
+
+ def run_asr_wer(args):
+     rank, lang, test_set, ckpt_dir = args
+
+     if lang == "zh":
+         import zhconv
+
+         torch.cuda.set_device(rank)
+     elif lang == "en":
+         os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
+     else:
+         raise NotImplementedError(
+             "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
+         )
+
+     asr_model = load_asr_model(lang, ckpt_dir=ckpt_dir)
+
+     from zhon.hanzi import punctuation
+
+     punctuation_all = punctuation + string.punctuation
+     wers = []
+
+     from jiwer import compute_measures
+
+     for gen_wav, prompt_wav, truth in tqdm(test_set):
+         if lang == "zh":
+             res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
+             hypo = res[0]["text"]
+             hypo = zhconv.convert(hypo, "zh-cn")
+         elif lang == "en":
+             segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
+             hypo = ""
+             for segment in segments:
+                 hypo = hypo + " " + segment.text
+
+         # raw_truth = truth
+         # raw_hypo = hypo
+
+         for x in punctuation_all:
+             truth = truth.replace(x, "")
+             hypo = hypo.replace(x, "")
+
+         truth = truth.replace("  ", " ")
+         hypo = hypo.replace("  ", " ")
+
+         if lang == "zh":
+             truth = " ".join([x for x in truth])
+             hypo = " ".join([x for x in hypo])
+         elif lang == "en":
+             truth = truth.lower()
+             hypo = hypo.lower()
+
+         measures = compute_measures(truth, hypo)
+         wer = measures["wer"]
+
+         # ref_list = truth.split(" ")
+         # subs = measures["substitutions"] / len(ref_list)
+         # dele = measures["deletions"] / len(ref_list)
+         # inse = measures["insertions"] / len(ref_list)
+
+         wers.append(wer)
+
+     return wers
+
+
+ # SIM Evaluation
+
+
+ def run_sim(args):
+     rank, test_set, ckpt_dir = args
+     device = f"cuda:{rank}"
+
+     model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
+     state_dict = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
+     model.load_state_dict(state_dict["model"], strict=False)
+
+     use_gpu = True if torch.cuda.is_available() else False
+     if use_gpu:
+         model = model.cuda(device)
+     model.eval()
+
+     sim_list = []
+     for wav1, wav2, truth in tqdm(test_set):
+         wav1, sr1 = torchaudio.load(wav1)
+         wav2, sr2 = torchaudio.load(wav2)
+
+         resample1 = torchaudio.transforms.Resample(orig_freq=sr1, new_freq=16000)
+         resample2 = torchaudio.transforms.Resample(orig_freq=sr2, new_freq=16000)
+         wav1 = resample1(wav1)
+         wav2 = resample2(wav2)
+
+         if use_gpu:
+             wav1 = wav1.cuda(device)
+             wav2 = wav2.cuda(device)
+         with torch.no_grad():
+             emb1 = model(wav1)
+             emb2 = model(wav2)
+
+         sim = F.cosine_similarity(emb1, emb2)[0].item()
+         # print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
+         sim_list.append(sim)
+
+     return sim_list
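For orientation, the helpers above are driven by the sibling eval scripts also added in this release (e.g. `eval_seedtts_testset.py`). Below is a minimal sketch of that flow; it is an illustration, not code from the diff, and the metadata/output paths are placeholders. It relies only on the signatures shown above: `get_seed_tts_test(metalst, gen_wav_dir, gpus)` returns one `(gpu, subset)` chunk per GPU, and `run_asr_wer` unpacks a single `(rank, lang, test_set, ckpt_dir)` tuple.

```python
# Sketch only: split a Seed-TTS test set across GPUs and compute WER.
import multiprocessing as mp

from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)  # CUDA in worker processes needs spawn
    gpus = [0, 1]
    # Placeholder paths: a meta.lst file and a directory of generated wavs.
    test_sets = get_seed_tts_test("seedtts_testset/zh/meta.lst", "results/gen_wavs", gpus)

    # One (rank, lang, test_set, ckpt_dir) tuple per worker; "" lets funasr
    # resolve the default paraformer-zh checkpoint.
    args = [(rank, "zh", subset, "") for rank, subset in test_sets]
    with mp.Pool(processes=len(gpus)) as pool:
        wer_lists = pool.map(run_asr_wer, args)

    wers = [w for sub in wer_lists for w in sub]
    print(f"WER over {len(wers)} utterances: {sum(wers) / len(wers):.4f}")
```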
xinference/thirdparty/f5_tts/infer/README.md
@@ -0,0 +1,191 @@
+ # Inference
+
+ The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
+
+ **More checkpoints contributed by the community can be found in [SHARED.md](SHARED.md), supporting more languages.**
+
+ Currently, a single generation supports a maximum of **30s**, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text, and they will automatically perform chunked generation. Long reference audio will be **clipped to ~15s**.
+
+ To avoid possible inference failures, make sure you have read through the following instructions.
+
+ - Use reference audio <15s and leave some silence (e.g. 1s) at the end. Otherwise there is a risk of truncating in the middle of a word, leading to suboptimal generation.
+ - Uppercase letters will be uttered letter by letter, so use lowercase letters for normal words.
+ - Add some spaces (blank: " ") or punctuation (e.g. "," ".") to explicitly introduce some pauses.
+ - Preprocess numbers into Chinese characters if you want them read in Chinese; otherwise they will be read in English.
+
+
+ ## Gradio App
+
+ Currently supported features:
+
+ - Basic TTS with Chunk Inference
+ - Multi-Style / Multi-Speaker Generation
+ - Voice Chat powered by Qwen2.5-3B-Instruct
+
+ The CLI command `f5-tts_infer-gradio` is equivalent to `python src/f5_tts/infer/infer_gradio.py`, which launches a Gradio app (web interface) for inference.
+
+ The script will load model checkpoints from Hugging Face. You can also manually download the files and update the path passed to `load_model()` in `infer_gradio.py`. Only the TTS models are loaded at first; the ASR model is loaded to transcribe if `ref_text` is not provided, and the LLM is loaded if Voice Chat is used.
+
+ It can also be used as a component of a larger application.
+ ```python
+ import gradio as gr
+ from f5_tts.infer.infer_gradio import app
+
+ with gr.Blocks() as main_app:
+     gr.Markdown("# This is an example of using F5-TTS within a bigger Gradio app")
+
+     # ... other Gradio components
+
+     app.render()
+
+ main_app.launch()
+ ```
+
+
+ ## CLI Inference
+
+ The CLI command `f5-tts_infer-cli` is equivalent to `python src/f5_tts/infer/infer_cli.py`, a command-line tool for inference.
+
+ The script will load model checkpoints from Hugging Face. You can also manually download the files and use `--ckpt_file` to specify the model you want to load, or update the path directly in `infer_cli.py`.
+
+ To change the vocabulary, use `--vocab_file` to provide your own `vocab.txt` file.
+
+ Basically, you can run inference with flags:
+ ```bash
+ # Leaving --ref_text "" will have the ASR model transcribe (extra GPU memory usage)
+ f5-tts_infer-cli \
+ --model "F5-TTS" \
+ --ref_audio "ref_audio.wav" \
+ --ref_text "The content, subtitle or transcription of reference audio." \
+ --gen_text "Some text you want TTS model generate for you."
+
+ # Choose Vocoder
+ f5-tts_infer-cli --vocoder_name bigvgan --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base_bigvgan/model_1250000.pt>
+ f5-tts_infer-cli --vocoder_name vocos --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base/model_1200000.safetensors>
+ ```
+
+ A `.toml` file allows for more flexible usage.
+
+ ```bash
+ f5-tts_infer-cli -c custom.toml
+ ```
+
+ For example, you can use a `.toml` file to pass in variables; refer to `src/f5_tts/infer/examples/basic/basic.toml`:
+
+ ```toml
+ # F5-TTS | E2-TTS
+ model = "F5-TTS"
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
+ # If an empty "", transcribes the reference audio automatically.
+ ref_text = "Some call me nature, others call me mother nature."
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
+ # File with text to generate. Ignores the text above.
+ gen_file = ""
+ remove_silence = false
+ output_dir = "tests"
+ ```
+
+ You can also leverage a `.toml` file for multi-style generation; refer to `src/f5_tts/infer/examples/multi/story.toml`.
+
+ ```toml
+ # F5-TTS | E2-TTS
+ model = "F5-TTS"
+ ref_audio = "infer/examples/multi/main.flac"
+ # If an empty "", transcribes the reference audio automatically.
+ ref_text = ""
+ gen_text = ""
+ # File with text to generate. Ignores the text above.
+ gen_file = "infer/examples/multi/story.txt"
+ remove_silence = true
+ output_dir = "tests"
+
+ [voices.town]
+ ref_audio = "infer/examples/multi/town.flac"
+ ref_text = ""
+
+ [voices.country]
+ ref_audio = "infer/examples/multi/country.flac"
+ ref_text = ""
+ ```
+ Mark the voice with `[main]`, `[town]`, or `[country]` whenever you want to change voices; refer to `src/f5_tts/infer/examples/multi/story.txt`.
+
+ ## Speech Editing
+
+ To test speech editing capabilities, use the following command:
+
+ ```bash
+ python src/f5_tts/infer/speech_edit.py
+ ```
+
+ ## Socket Realtime Client
+
+ To communicate with the socket server, you need to run
+ ```bash
+ python src/f5_tts/socket_server.py
+ ```
+
+ <details>
+ <summary>Then create a client to communicate</summary>
+
+ ```python
+ import socket
+ import numpy as np
+ import asyncio
+ import pyaudio
+
+ async def listen_to_voice(text, server_ip='localhost', server_port=9999):
+     client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+     client_socket.connect((server_ip, server_port))
+
+     async def play_audio_stream():
+         buffer = b''
+         p = pyaudio.PyAudio()
+         stream = p.open(format=pyaudio.paFloat32,
+                         channels=1,
+                         rate=24000,  # Ensure this matches the server's sampling rate
+                         output=True,
+                         frames_per_buffer=2048)
+
+         try:
+             while True:
+                 chunk = await asyncio.get_event_loop().run_in_executor(None, client_socket.recv, 1024)
+                 if not chunk:  # End of stream
+                     break
+                 if b"END_OF_AUDIO" in chunk:
+                     buffer += chunk.replace(b"END_OF_AUDIO", b"")
+                     if buffer:
+                         audio_array = np.frombuffer(buffer, dtype=np.float32).copy()  # Make a writable copy
+                         stream.write(audio_array.tobytes())
+                     break
+                 buffer += chunk
+                 if len(buffer) >= 4096:
+                     audio_array = np.frombuffer(buffer[:4096], dtype=np.float32).copy()  # Make a writable copy
+                     stream.write(audio_array.tobytes())
+                     buffer = buffer[4096:]
+         finally:
+             stream.stop_stream()
+             stream.close()
+             p.terminate()
+
+     try:
+         # Send only the text to the server
+         await asyncio.get_event_loop().run_in_executor(None, client_socket.sendall, text.encode('utf-8'))
+         await play_audio_stream()
+         print("Audio playback finished.")
+
+     except Exception as e:
+         print(f"Error in listen_to_voice: {e}")
+
+     finally:
+         client_socket.close()
+
+ # Example usage: Replace this with your actual server IP and port
+ async def main():
+     await listen_to_voice("my name is jenny..", server_ip='localhost', server_port=9998)
+
+ # Run the main async function
+ asyncio.run(main())
+ ```
+
+ </details>
+
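The multi-voice section of the README above points to `src/f5_tts/infer/examples/multi/story.txt` for the `[main]`/`[town]`/`[country]` markers, but that file's contents are not shown in this diff. As an illustration only (the actual upstream text differs), the markers are inline voice switches whose names must match the `[voices.*]` sections of `story.toml`:

```python
# Illustration only: writing a story.txt that uses the bracket voice markers
# described in the README. Text before any marker uses the main voice.
story = (
    "A narrator opens the tale in the main voice. "
    "[town] Hello there, calls a voice from the town! "
    "[country] And a slow greeting drifts in from the country. "
    "[main] The narrator returns to close the story."
)
with open("story.txt", "w", encoding="utf-8") as f:
    f.write(story)
```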
xinference/thirdparty/f5_tts/infer/SHARED.md
@@ -0,0 +1,74 @@
+ <!-- omit in toc -->
+ # Shared Model Cards
+
+ <!-- omit in toc -->
+ ### **Prerequisites of use**
+ - This document serves as a quick lookup table for community training/finetuning results, with support for various languages.
+ - The models in this repository are open source and are based on voluntary contributions from contributors.
+ - Use of these models must be conditioned on respect for their respective creators. The convenience they bring comes from the creators' efforts.
+
+ <!-- omit in toc -->
+ ### **Welcome to share here**
+ - Have a pretrained/finetuned result: a model checkpoint (preferably pruned to facilitate inference, i.e. keeping only `ema_model_state_dict`) and the corresponding vocab file (for tokenization).
+ - Host a public [huggingface model repository](https://huggingface.co/new) and upload the model-related files.
+ - Make a pull request adding a model card to the current page, i.e. `src/f5_tts/infer/SHARED.md`.
+
+ <!-- omit in toc -->
+ ### Supported Languages
+ - [Multilingual](#multilingual)
+   - [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
+ - [Mandarin](#mandarin)
+ - [Japanese](#japanese)
+   - [F5-TTS Base @ pretrain/finetune @ ja](#f5-tts-base--pretrainfinetune--ja)
+ - [English](#english)
+ - [French](#french)
+   - [French LibriVox @ finetune @ fr](#french-librivox--finetune--fr)
+
+
+ ## Multilingual
+
+ #### F5-TTS Base @ pretrain @ zh & en
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
+ |:---:|:------------:|:-----------:|:-------------:|
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
+
+ ```bash
+ MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
+ VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
+ ```
+
+ *Other info, e.g. author info, GitHub repo, links to sample results, usage instructions, tutorials (blog, video, etc.) ...*
+
+
+ ## Mandarin
+
+ ## Japanese
+
+ #### F5-TTS Base @ pretrain/finetune @ ja
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
+ |:---:|:------------:|:-----------:|:-------------:|
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/Jmica/F5TTS/tree/main/JA_8500000)|[Emilia 1.7k JA](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07) & [Galgame Dataset 5.4k](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset)|cc-by-nc-4.0|
+
+ ```bash
+ MODEL_CKPT: hf://Jmica/F5TTS/JA_8500000/model_8499660.pt
+ VOCAB_FILE: hf://Jmica/F5TTS/JA_8500000/vocab_updated.txt
+ ```
+
+ ## English
+
+
+ ## French
+
+ #### French LibriVox @ finetune @ fr
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
+ |:---:|:------------:|:-----------:|:-------------:|
+ |F5-TTS French|[ckpt & vocab](https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced)|[LibriVox](https://librivox.org/)|cc-by-nc-4.0|
+
+ ```bash
+ MODEL_CKPT: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.pt
+ VOCAB_FILE: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
+ ```
+
+ - [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
+ - [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
+ - [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).
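The `MODEL_CKPT`/`VOCAB_FILE` entries above use an `hf://<repo>/<path-in-repo>` shorthand. A minimal sketch of resolving one of them with `huggingface_hub` (a tooling assumption on our part, not something SHARED.md prescribes):

```python
# Sketch: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors maps to
# repo_id="SWivid/F5-TTS" plus the in-repo filename.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="SWivid/F5-TTS", filename="F5TTS_Base/model_1200000.safetensors"
)
vocab_path = hf_hub_download(repo_id="SWivid/F5-TTS", filename="F5TTS_Base/vocab.txt")
print(ckpt_path, vocab_path)  # local cache paths to pass on to the loader
```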
xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml
@@ -0,0 +1,11 @@
+ # F5-TTS | E2-TTS
+ model = "F5-TTS"
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
+ # If an empty "", transcribes the reference audio automatically.
+ ref_text = "Some call me nature, others call me mother nature."
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
+ # File with text to generate. Ignores the text above.
+ gen_file = ""
+ remove_silence = false
+ output_dir = "tests"
+ output_file = "infer_cli_out.wav"
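This is the config consumed by `f5-tts_infer-cli -c` (see the README diff above). A minimal sketch of parsing it yourself, assuming Python 3.11+ for the stdlib `tomllib`:

```python
# Sketch: load basic.toml and inspect the fields infer_cli reads.
import tomllib

with open("basic.toml", "rb") as f:
    cfg = tomllib.load(f)

print(cfg["model"])      # "F5-TTS"
print(cfg["ref_audio"])  # "infer/examples/basic/basic_ref_en.wav"
print(cfg["output_dir"], cfg["output_file"])  # "tests" "infer_cli_out.wav"
```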