xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic (see the registry page for details).

Files changed (343)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +77 -71
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +79 -19
  6. xinference/core/supervisor.py +172 -10
  7. xinference/core/utils.py +12 -8
  8. xinference/core/worker.py +102 -4
  9. xinference/deploy/cmdline.py +3 -1
  10. xinference/deploy/test/test_cmdline.py +56 -0
  11. xinference/isolation.py +24 -0
  12. xinference/model/audio/core.py +16 -0
  13. xinference/model/audio/cosyvoice.py +39 -6
  14. xinference/model/audio/f5tts.py +200 -0
  15. xinference/model/audio/f5tts_mlx.py +260 -0
  16. xinference/model/audio/fish_speech.py +36 -111
  17. xinference/model/audio/melotts.py +110 -0
  18. xinference/model/audio/model_spec.json +99 -3
  19. xinference/model/audio/model_spec_modelscope.json +27 -0
  20. xinference/model/audio/utils.py +32 -0
  21. xinference/model/audio/whisper.py +35 -10
  22. xinference/model/embedding/core.py +203 -142
  23. xinference/model/embedding/model_spec.json +7 -0
  24. xinference/model/embedding/model_spec_modelscope.json +8 -0
  25. xinference/model/image/core.py +69 -1
  26. xinference/model/image/model_spec.json +145 -4
  27. xinference/model/image/model_spec_modelscope.json +150 -4
  28. xinference/model/image/stable_diffusion/core.py +45 -13
  29. xinference/model/llm/__init__.py +4 -2
  30. xinference/model/llm/llm_family.json +536 -53
  31. xinference/model/llm/llm_family.py +15 -36
  32. xinference/model/llm/llm_family_modelscope.json +454 -20
  33. xinference/model/llm/memory.py +1 -1
  34. xinference/model/llm/mlx/core.py +248 -52
  35. xinference/model/llm/sglang/core.py +1 -0
  36. xinference/model/llm/transformers/chatglm.py +9 -5
  37. xinference/model/llm/transformers/cogagent.py +272 -0
  38. xinference/model/llm/transformers/core.py +2 -0
  39. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  40. xinference/model/llm/transformers/utils.py +16 -8
  41. xinference/model/llm/utils.py +36 -4
  42. xinference/model/llm/vllm/core.py +53 -10
  43. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  44. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  45. xinference/model/llm/vllm/xavier/block.py +111 -0
  46. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  47. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  48. xinference/model/llm/vllm/xavier/collective.py +74 -0
  49. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  50. xinference/model/llm/vllm/xavier/engine.py +247 -0
  51. xinference/model/llm/vllm/xavier/executor.py +134 -0
  52. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  53. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  54. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  55. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  56. xinference/model/video/diffusers.py +14 -0
  57. xinference/model/video/model_spec.json +15 -0
  58. xinference/model/video/model_spec_modelscope.json +16 -0
  59. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  60. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  61. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  62. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  63. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  64. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  65. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  66. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  67. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  68. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  69. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  70. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  71. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  72. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  73. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  74. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  75. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  76. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  77. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  78. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  79. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  80. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  81. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  82. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  83. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  84. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  85. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  86. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  87. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  88. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  89. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  90. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  91. xinference/thirdparty/f5_tts/api.py +166 -0
  92. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  93. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  94. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  95. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  96. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  97. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  98. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  99. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  100. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  101. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  102. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  103. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  104. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  105. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  106. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  107. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  108. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  109. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  110. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  111. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  112. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  114. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  115. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  116. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  117. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  118. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  119. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  120. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  121. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  122. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  123. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  124. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  125. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  126. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  127. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  128. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  129. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  130. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  131. xinference/thirdparty/f5_tts/train/README.md +77 -0
  132. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  133. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  134. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  135. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  136. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  137. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  138. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  139. xinference/thirdparty/f5_tts/train/train.py +75 -0
  140. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  141. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  142. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  143. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  144. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  145. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  146. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  147. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  148. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  149. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  150. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  151. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  152. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  153. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  154. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  155. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  156. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  157. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  158. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  159. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  160. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  161. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  162. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  163. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  164. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  165. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  166. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  167. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  168. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  169. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  170. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  171. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  172. xinference/thirdparty/matcha/utils/utils.py +2 -2
  173. xinference/thirdparty/melo/api.py +135 -0
  174. xinference/thirdparty/melo/app.py +61 -0
  175. xinference/thirdparty/melo/attentions.py +459 -0
  176. xinference/thirdparty/melo/commons.py +160 -0
  177. xinference/thirdparty/melo/configs/config.json +94 -0
  178. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  179. xinference/thirdparty/melo/data_utils.py +413 -0
  180. xinference/thirdparty/melo/download_utils.py +67 -0
  181. xinference/thirdparty/melo/infer.py +25 -0
  182. xinference/thirdparty/melo/init_downloads.py +14 -0
  183. xinference/thirdparty/melo/losses.py +58 -0
  184. xinference/thirdparty/melo/main.py +36 -0
  185. xinference/thirdparty/melo/mel_processing.py +174 -0
  186. xinference/thirdparty/melo/models.py +1030 -0
  187. xinference/thirdparty/melo/modules.py +598 -0
  188. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  189. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  190. xinference/thirdparty/melo/preprocess_text.py +135 -0
  191. xinference/thirdparty/melo/split_utils.py +174 -0
  192. xinference/thirdparty/melo/text/__init__.py +35 -0
  193. xinference/thirdparty/melo/text/chinese.py +199 -0
  194. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  195. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  196. xinference/thirdparty/melo/text/cleaner.py +36 -0
  197. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  198. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  199. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  200. xinference/thirdparty/melo/text/english.py +284 -0
  201. xinference/thirdparty/melo/text/english_bert.py +39 -0
  202. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  203. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  204. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  205. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  206. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  207. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  208. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  209. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  210. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  211. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  212. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  213. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  214. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  215. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  216. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  217. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  218. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  219. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  220. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  221. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  222. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  223. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  224. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  225. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  226. xinference/thirdparty/melo/text/french.py +94 -0
  227. xinference/thirdparty/melo/text/french_bert.py +39 -0
  228. xinference/thirdparty/melo/text/japanese.py +647 -0
  229. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  230. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  231. xinference/thirdparty/melo/text/korean.py +192 -0
  232. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  233. xinference/thirdparty/melo/text/spanish.py +122 -0
  234. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  235. xinference/thirdparty/melo/text/symbols.py +290 -0
  236. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  237. xinference/thirdparty/melo/train.py +635 -0
  238. xinference/thirdparty/melo/train.sh +19 -0
  239. xinference/thirdparty/melo/transforms.py +209 -0
  240. xinference/thirdparty/melo/utils.py +424 -0
  241. xinference/types.py +15 -0
  242. xinference/web/ui/build/asset-manifest.json +6 -6
  243. xinference/web/ui/build/index.html +1 -1
  244. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  245. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  246. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  247. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  248. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  249. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  250. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  251. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  252. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  253. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  254. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  255. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  256. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  257. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  258. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  259. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  260. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  261. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  262. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  263. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  264. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  265. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  266. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  292. xinference/web/ui/node_modules/.package-lock.json +67 -3
  293. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  294. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  295. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  296. xinference/web/ui/node_modules/i18next/package.json +129 -0
  297. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  298. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  299. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  300. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  301. xinference/web/ui/package-lock.json +69 -3
  302. xinference/web/ui/package.json +2 -0
  303. xinference/web/ui/src/locales/en.json +186 -0
  304. xinference/web/ui/src/locales/zh.json +186 -0
  305. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/METADATA +68 -32
  306. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/RECORD +316 -122
  307. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  308. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  309. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  310. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  311. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  312. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  313. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  314. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  315. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  316. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  317. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  318. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  319. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  320. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  321. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  322. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  323. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  324. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  325. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  326. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  327. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  328. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  329. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  330. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  331. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  332. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  333. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  334. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  335. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  336. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  337. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  338. /xinference/thirdparty/{fish_speech/tools → melo/text/fr_phonemizer}/__init__.py +0 -0
  339. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  340. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  341. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/WHEEL +0 -0
  342. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  343. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/f5_tts/infer/utils_infer.py
@@ -0,0 +1,538 @@
+ # A unified script for inference process
+ # Make adjustments inside functions, and consider both gradio and cli scripts if need to change func output format
+ import os
+ import sys
+
+ os.environ["PYTOCH_ENABLE_MPS_FALLBACK"] = "1" # for MPS device compatibility
+ sys.path.append(f"../../{os.path.dirname(os.path.abspath(__file__))}/third_party/BigVGAN/")
+
+ import hashlib
+ import re
+ import tempfile
+ from importlib.resources import files
+
+ # import matplotlib
+
+ # matplotlib.use("Agg")
+ #
+ # import matplotlib.pylab as plt
+ import numpy as np
+ import torch
+ import torchaudio
+ import tqdm
+ from huggingface_hub import snapshot_download, hf_hub_download
+ from pydub import AudioSegment, silence
+ from transformers import pipeline
+ from vocos import Vocos
+
+ from f5_tts.model import CFM
+ from f5_tts.model.utils import (
+     get_tokenizer,
+     convert_char_to_pinyin,
+ )
+
+ _ref_audio_cache = {}
+
+ device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+
+ # -----------------------------------------
+
+ target_sample_rate = 24000
+ n_mel_channels = 100
+ hop_length = 256
+ win_length = 1024
+ n_fft = 1024
+ mel_spec_type = "vocos"
+ target_rms = 0.1
+ cross_fade_duration = 0.15
+ ode_method = "euler"
+ nfe_step = 32 # 16, 32
+ cfg_strength = 2.0
+ sway_sampling_coef = -1.0
+ speed = 1.0
+ fix_duration = None
+
+ # -----------------------------------------
+
+
+ # chunk text into smaller pieces
+
+
+ def chunk_text(text, max_chars=135):
+     """
+     Splits the input text into chunks, each with a maximum number of characters.
+
+     Args:
+         text (str): The text to be split.
+         max_chars (int): The maximum number of characters per chunk.
+
+     Returns:
+         List[str]: A list of text chunks.
+     """
+     chunks = []
+     current_chunk = ""
+     # Split the text into sentences based on punctuation followed by whitespace
+     sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)
+
+     for sentence in sentences:
+         if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
+             current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
+         else:
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
+
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+
+     return chunks
+
+
+ # load vocoder
+ def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device, hf_cache_dir=None):
+     if vocoder_name == "vocos":
+         # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
+         if is_local:
+             print(f"Load vocos from local path {local_path}")
+             config_path = f"{local_path}/config.yaml"
+             model_path = f"{local_path}/pytorch_model.bin"
+         else:
+             print("Download Vocos from huggingface charactr/vocos-mel-24khz")
+             repo_id = "charactr/vocos-mel-24khz"
+             config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
+             model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
+         vocoder = Vocos.from_hparams(config_path)
+         state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
+         from vocos.feature_extractors import EncodecFeatures
+
+         if isinstance(vocoder.feature_extractor, EncodecFeatures):
+             encodec_parameters = {
+                 "feature_extractor.encodec." + key: value
+                 for key, value in vocoder.feature_extractor.encodec.state_dict().items()
+             }
+             state_dict.update(encodec_parameters)
+         vocoder.load_state_dict(state_dict)
+         vocoder = vocoder.eval().to(device)
+     elif vocoder_name == "bigvgan":
+         try:
+             from third_party.BigVGAN import bigvgan
+         except ImportError:
+             print("You need to follow the README to init submodule and change the BigVGAN source code.")
+         if is_local:
+             """download from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main"""
+             vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
+         else:
+             local_path = snapshot_download(repo_id="nvidia/bigvgan_v2_24khz_100band_256x", cache_dir=hf_cache_dir)
+             vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
+
+         vocoder.remove_weight_norm()
+         vocoder = vocoder.eval().to(device)
+     return vocoder
+
+
+ # load asr pipeline
+
+ asr_pipe = None
+
+
+ def initialize_asr_pipeline(device: str = device, dtype=None):
+     if dtype is None:
+         dtype = (
+             torch.float16 if "cuda" in device and torch.cuda.get_device_properties(device).major >= 6 else torch.float32
+         )
+     global asr_pipe
+     asr_pipe = pipeline(
+         "automatic-speech-recognition",
+         model="openai/whisper-large-v3-turbo",
+         torch_dtype=dtype,
+         device=device,
+     )
+
+
+ # transcribe
+
+
+ def transcribe(ref_audio, language=None):
+     global asr_pipe
+     if asr_pipe is None:
+         initialize_asr_pipeline(device=device)
+     return asr_pipe(
+         ref_audio,
+         chunk_length_s=30,
+         batch_size=128,
+         generate_kwargs={"task": "transcribe", "language": language} if language else {"task": "transcribe"},
+         return_timestamps=False,
+     )["text"].strip()
+
+
+ # load model checkpoint for inference
+
+
+ def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
+     if dtype is None:
+         dtype = (
+             torch.float16 if "cuda" in device and torch.cuda.get_device_properties(device).major >= 6 else torch.float32
+         )
+     model = model.to(dtype)
+
+     ckpt_type = ckpt_path.split(".")[-1]
+     if ckpt_type == "safetensors":
+         from safetensors.torch import load_file
+
+         checkpoint = load_file(ckpt_path, device=device)
+     else:
+         checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)
+
+     if use_ema:
+         if ckpt_type == "safetensors":
+             checkpoint = {"ema_model_state_dict": checkpoint}
+         checkpoint["model_state_dict"] = {
+             k.replace("ema_model.", ""): v
+             for k, v in checkpoint["ema_model_state_dict"].items()
+             if k not in ["initted", "step"]
+         }
+
+         # patch for backward compatibility, 305e3ea
+         for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
+             if key in checkpoint["model_state_dict"]:
+                 del checkpoint["model_state_dict"][key]
+
+         model.load_state_dict(checkpoint["model_state_dict"])
+     else:
+         if ckpt_type == "safetensors":
+             checkpoint = {"model_state_dict": checkpoint}
+         model.load_state_dict(checkpoint["model_state_dict"])
+
+     del checkpoint
+     torch.cuda.empty_cache()
+
+     return model.to(device)
+
+
+ # load model for inference
+
+
+ def load_model(
+     model_cls,
+     model_cfg,
+     ckpt_path,
+     mel_spec_type=mel_spec_type,
+     vocab_file="",
+     ode_method=ode_method,
+     use_ema=True,
+     device=device,
+ ):
+     if vocab_file == "":
+         vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
+     tokenizer = "custom"
+
+     print("\nvocab : ", vocab_file)
+     print("token : ", tokenizer)
+     print("model : ", ckpt_path, "\n")
+
+     vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
+     model = CFM(
+         transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
+         mel_spec_kwargs=dict(
+             n_fft=n_fft,
+             hop_length=hop_length,
+             win_length=win_length,
+             n_mel_channels=n_mel_channels,
+             target_sample_rate=target_sample_rate,
+             mel_spec_type=mel_spec_type,
+         ),
+         odeint_kwargs=dict(
+             method=ode_method,
+         ),
+         vocab_char_map=vocab_char_map,
+     ).to(device)
+
+     dtype = torch.float32 if mel_spec_type == "bigvgan" else None
+     model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
+
+     return model
+
+
+ def remove_silence_edges(audio, silence_threshold=-42):
+     # Remove silence from the start
+     non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
+     audio = audio[non_silent_start_idx:]
+
+     # Remove silence from the end
+     non_silent_end_duration = audio.duration_seconds
+     for ms in reversed(audio):
+         if ms.dBFS > silence_threshold:
+             break
+         non_silent_end_duration -= 0.001
+     trimmed_audio = audio[: int(non_silent_end_duration * 1000)]
+
+     return trimmed_audio
+
+
+ # preprocess reference audio and text
+
+
+ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
+     show_info("Converting audio...")
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+         aseg = AudioSegment.from_file(ref_audio_orig)
+
+         if clip_short:
+             # 1. try to find long silence for clipping
+             non_silent_segs = silence.split_on_silence(
+                 aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+             )
+             non_silent_wave = AudioSegment.silent(duration=0)
+             for non_silent_seg in non_silent_segs:
+                 if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
+                     show_info("Audio is over 15s, clipping short. (1)")
+                     break
+                 non_silent_wave += non_silent_seg
+
+             # 2. try to find short silence for clipping if 1. failed
+             if len(non_silent_wave) > 15000:
+                 non_silent_segs = silence.split_on_silence(
+                     aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
+                 )
+                 non_silent_wave = AudioSegment.silent(duration=0)
+                 for non_silent_seg in non_silent_segs:
+                     if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
+                         show_info("Audio is over 15s, clipping short. (2)")
+                         break
+                     non_silent_wave += non_silent_seg
+
+             aseg = non_silent_wave
+
+             # 3. if no proper silence found for clipping
+             if len(aseg) > 15000:
+                 aseg = aseg[:15000]
+                 show_info("Audio is over 15s, clipping short. (3)")
+
+         aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
+         aseg.export(f.name, format="wav")
+         ref_audio = f.name
+
+     # Compute a hash of the reference audio file
+     with open(ref_audio, "rb") as audio_file:
+         audio_data = audio_file.read()
+         audio_hash = hashlib.md5(audio_data).hexdigest()
+
+     if not ref_text.strip():
+         global _ref_audio_cache
+         if audio_hash in _ref_audio_cache:
+             # Use cached asr transcription
+             show_info("Using cached reference text...")
+             ref_text = _ref_audio_cache[audio_hash]
+         else:
+             show_info("No reference text provided, transcribing reference audio...")
+             ref_text = transcribe(ref_audio)
+             # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
+             _ref_audio_cache[audio_hash] = ref_text
+     else:
+         show_info("Using custom reference text...")
+
+     # Ensure ref_text ends with a proper sentence-ending punctuation
+     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
+         if ref_text.endswith("."):
+             ref_text += " "
+         else:
+             ref_text += ". "
+
+     print("ref_text ", ref_text)
+
+     return ref_audio, ref_text
+
+
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
+
+
+ def infer_process(
+     ref_audio,
+     ref_text,
+     gen_text,
+     model_obj,
+     vocoder,
+     mel_spec_type=mel_spec_type,
+     show_info=print,
+     progress=tqdm,
+     target_rms=target_rms,
+     cross_fade_duration=cross_fade_duration,
+     nfe_step=nfe_step,
+     cfg_strength=cfg_strength,
+     sway_sampling_coef=sway_sampling_coef,
+     speed=speed,
+     fix_duration=fix_duration,
+     device=device,
+ ):
+     # Split the input text into batches
+     audio, sr = torchaudio.load(ref_audio)
+     max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
+     gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
+     for i, gen_text in enumerate(gen_text_batches):
+         print(f"gen_text {i}", gen_text)
+
+     show_info(f"Generating audio in {len(gen_text_batches)} batches...")
+     return infer_batch_process(
+         (audio, sr),
+         ref_text,
+         gen_text_batches,
+         model_obj,
+         vocoder,
+         mel_spec_type=mel_spec_type,
+         progress=progress,
+         target_rms=target_rms,
+         cross_fade_duration=cross_fade_duration,
+         nfe_step=nfe_step,
+         cfg_strength=cfg_strength,
+         sway_sampling_coef=sway_sampling_coef,
+         speed=speed,
+         fix_duration=fix_duration,
+         device=device,
+     )
+
+
+ # infer batches
+
+
+ def infer_batch_process(
+     ref_audio,
+     ref_text,
+     gen_text_batches,
+     model_obj,
+     vocoder,
+     mel_spec_type="vocos",
+     progress=tqdm,
+     target_rms=0.1,
+     cross_fade_duration=0.15,
+     nfe_step=32,
+     cfg_strength=2.0,
+     sway_sampling_coef=-1,
+     speed=1,
+     fix_duration=None,
+     device=None,
+ ):
+     audio, sr = ref_audio
+     if audio.shape[0] > 1:
+         audio = torch.mean(audio, dim=0, keepdim=True)
+
+     rms = torch.sqrt(torch.mean(torch.square(audio)))
+     if rms < target_rms:
+         audio = audio * target_rms / rms
+     if sr != target_sample_rate:
+         resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+         audio = resampler(audio)
+     audio = audio.to(device)
+
+     generated_waves = []
+     spectrograms = []
+
+     if len(ref_text[-1].encode("utf-8")) == 1:
+         ref_text = ref_text + " "
+     for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
+         # Prepare the text
+         text_list = [ref_text + gen_text]
+         final_text_list = convert_char_to_pinyin(text_list)
+
+         ref_audio_len = audio.shape[-1] // hop_length
+         if fix_duration is not None:
+             duration = int(fix_duration * target_sample_rate / hop_length)
+         else:
+             # Calculate duration
+             ref_text_len = len(ref_text.encode("utf-8"))
+             gen_text_len = len(gen_text.encode("utf-8"))
+             duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
+
+         # inference
+         with torch.inference_mode():
+             generated, _ = model_obj.sample(
+                 cond=audio,
+                 text=final_text_list,
+                 duration=duration,
+                 steps=nfe_step,
+                 cfg_strength=cfg_strength,
+                 sway_sampling_coef=sway_sampling_coef,
+             )
+
+             generated = generated.to(torch.float32)
+             generated = generated[:, ref_audio_len:, :]
+             generated_mel_spec = generated.permute(0, 2, 1)
+             if mel_spec_type == "vocos":
+                 generated_wave = vocoder.decode(generated_mel_spec)
+             elif mel_spec_type == "bigvgan":
+                 generated_wave = vocoder(generated_mel_spec)
+             if rms < target_rms:
+                 generated_wave = generated_wave * rms / target_rms
+
+             # wav -> numpy
+             generated_wave = generated_wave.squeeze().cpu().numpy()
+
+             generated_waves.append(generated_wave)
+             spectrograms.append(generated_mel_spec[0].cpu().numpy())
+
+     # Combine all generated waves with cross-fading
+     if cross_fade_duration <= 0:
+         # Simply concatenate
+         final_wave = np.concatenate(generated_waves)
+     else:
+         final_wave = generated_waves[0]
+         for i in range(1, len(generated_waves)):
+             prev_wave = final_wave
+             next_wave = generated_waves[i]
+
+             # Calculate cross-fade samples, ensuring it does not exceed wave lengths
+             cross_fade_samples = int(cross_fade_duration * target_sample_rate)
+             cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
+
+             if cross_fade_samples <= 0:
+                 # No overlap possible, concatenate
+                 final_wave = np.concatenate([prev_wave, next_wave])
+                 continue
+
+             # Overlapping parts
+             prev_overlap = prev_wave[-cross_fade_samples:]
+             next_overlap = next_wave[:cross_fade_samples]
+
+             # Fade out and fade in
+             fade_out = np.linspace(1, 0, cross_fade_samples)
+             fade_in = np.linspace(0, 1, cross_fade_samples)
+
+             # Cross-faded overlap
+             cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
+
+             # Combine
+             new_wave = np.concatenate(
+                 [prev_wave[:-cross_fade_samples], cross_faded_overlap, next_wave[cross_fade_samples:]]
+             )
+
+             final_wave = new_wave
+
+     # Create a combined spectrogram
+     combined_spectrogram = np.concatenate(spectrograms, axis=1)
+
+     return final_wave, target_sample_rate, combined_spectrogram
+
+
+ # remove silence from generated wav
+
+
+ def remove_silence_for_generated_wav(filename):
+     aseg = AudioSegment.from_file(filename)
+     non_silent_segs = silence.split_on_silence(
+         aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
+     )
+     non_silent_wave = AudioSegment.silent(duration=0)
+     for non_silent_seg in non_silent_segs:
+         non_silent_wave += non_silent_seg
+     aseg = non_silent_wave
+     aseg.export(filename, format="wav")
+
+
+ # save spectrogram
+
+
+ def save_spectrogram(spectrogram, path):
+     plt.figure(figsize=(12, 4))
+     plt.imshow(spectrogram, origin="lower", aspect="auto")
+     plt.colorbar()
+     plt.savefig(path)
+     plt.close()
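For orientation, here is a minimal usage sketch of the inference utilities added above, assuming the vendored `f5_tts` package is importable as a top-level package. The `model_cfg` values and the checkpoint path are illustrative assumptions (they mirror the F5TTS_Base defaults used elsewhere in the package), not contents of this diff:

```python
# Sketch only: ckpt_path and model_cfg are assumed placeholder values.
import soundfile as sf

from f5_tts.model import DiT
from f5_tts.infer.utils_infer import (
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
)

vocoder = load_vocoder(vocoder_name="vocos")  # fetches charactr/vocos-mel-24khz if not local
model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
model = load_model(DiT, model_cfg, ckpt_path="/path/to/F5TTS_Base/model_1200000.safetensors")

# Clips the reference to <= 15 s and, since ref_text is empty, transcribes it via Whisper;
# infer_process() then chunks the text, samples mel batches, and cross-fades the waveforms.
ref_audio, ref_text = preprocess_ref_audio_text("ref.wav", "")
wave, sample_rate, _ = infer_process(ref_audio, ref_text, "Text to speak.", model, vocoder)
sf.write("out.wav", wave, sample_rate)
```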
xinference/thirdparty/f5_tts/model/__init__.py
@@ -0,0 +1,10 @@
+ from f5_tts.model.cfm import CFM
+
+ from f5_tts.model.backbones.unett import UNetT
+ from f5_tts.model.backbones.dit import DiT
+ from f5_tts.model.backbones.mmdit import MMDiT
+
+ # from f5_tts.model.trainer import Trainer
+
+
+ __all__ = ["CFM", "UNetT", "DiT", "MMDiT"] # , "Trainer"]
xinference/thirdparty/f5_tts/model/backbones/README.md
@@ -0,0 +1,20 @@
+ ## Backbones quick introduction
+
+
+ ### unett.py
+ - flat unet transformer
+ - structure same as in e2-tts & voicebox paper except using rotary pos emb
+ - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat
+
+ ### dit.py
+ - adaln-zero dit
+ - embedded timestep as condition
+ - concatted noised_input + masked_cond + embedded_text, linear proj in
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
+ - possible long skip connection (first layer to last layer)
+
+ ### mmdit.py
+ - sd3 structure
+ - timestep as condition
+ - left stream: text embedded and applied a abs pos emb
+ - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett
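For context, a brief sketch of how the backbones described in this README plug into the `CFM` wrapper exported by `f5_tts.model`; the hyperparameter values below are placeholder assumptions for illustration, not values taken from this diff:

```python
# Sketch: per the README, UNetT, DiT, and MMDiT are interchangeable as CFM's transformer.
# All dimensions below are assumed placeholders.
from f5_tts.model import CFM, DiT  # or UNetT / MMDiT

backbone = DiT(
    dim=512, depth=8, heads=8, ff_mult=2, text_dim=512, conv_layers=4,
    text_num_embeds=256,  # vocabulary size, normally taken from get_tokenizer()
    mel_dim=100,          # must match the mel spectrogram channel count
)
model = CFM(
    transformer=backbone,
    mel_spec_kwargs=dict(
        n_fft=1024, hop_length=256, win_length=1024,
        n_mel_channels=100, target_sample_rate=24000, mel_spec_type="vocos",
    ),
    odeint_kwargs=dict(method="euler"),
)
```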