xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/f5_tts/model/cfm.py (new file)
@@ -0,0 +1,285 @@
+ """
+ ein notation:
+ b - batch
+ n - sequence
+ nt - text sequence
+ nw - raw wave length
+ d - dimension
+ """
+
+ from __future__ import annotations
+
+ from random import random
+ from typing import Callable
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from torch.nn.utils.rnn import pad_sequence
+ from torchdiffeq import odeint
+
+ from f5_tts.model.modules import MelSpec
+ from f5_tts.model.utils import (
+     default,
+     exists,
+     lens_to_mask,
+     list_str_to_idx,
+     list_str_to_tensor,
+     mask_from_frac_lengths,
+ )
+
+
+ class CFM(nn.Module):
+     def __init__(
+         self,
+         transformer: nn.Module,
+         sigma=0.0,
+         odeint_kwargs: dict = dict(
+             # atol = 1e-5,
+             # rtol = 1e-5,
+             method="euler"  # 'midpoint'
+         ),
+         audio_drop_prob=0.3,
+         cond_drop_prob=0.2,
+         num_channels=None,
+         mel_spec_module: nn.Module | None = None,
+         mel_spec_kwargs: dict = dict(),
+         frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
+         vocab_char_map: dict[str:int] | None = None,
+     ):
+         super().__init__()
+
+         self.frac_lengths_mask = frac_lengths_mask
+
+         # mel spec
+         self.mel_spec = default(mel_spec_module, MelSpec(**mel_spec_kwargs))
+         num_channels = default(num_channels, self.mel_spec.n_mel_channels)
+         self.num_channels = num_channels
+
+         # classifier-free guidance
+         self.audio_drop_prob = audio_drop_prob
+         self.cond_drop_prob = cond_drop_prob
+
+         # transformer
+         self.transformer = transformer
+         dim = transformer.dim
+         self.dim = dim
+
+         # conditional flow related
+         self.sigma = sigma
+
+         # sampling related
+         self.odeint_kwargs = odeint_kwargs
+
+         # vocab map for tokenization
+         self.vocab_char_map = vocab_char_map
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+     @torch.no_grad()
+     def sample(
+         self,
+         cond: float["b n d"] | float["b nw"],  # noqa: F722
+         text: int["b nt"] | list[str],  # noqa: F722
+         duration: int | int["b"],  # noqa: F821
+         *,
+         lens: int["b"] | None = None,  # noqa: F821
+         steps=32,
+         cfg_strength=1.0,
+         sway_sampling_coef=None,
+         seed: int | None = None,
+         max_duration=4096,
+         vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None,  # noqa: F722
+         no_ref_audio=False,
+         duplicate_test=False,
+         t_inter=0.1,
+         edit_mask=None,
+     ):
+         self.eval()
+         # raw wave
+
+         if cond.ndim == 2:
+             cond = self.mel_spec(cond)
+             cond = cond.permute(0, 2, 1)
+             assert cond.shape[-1] == self.num_channels
+
+         cond = cond.to(next(self.parameters()).dtype)
+
+         batch, cond_seq_len, device = *cond.shape[:2], cond.device
+         if not exists(lens):
+             lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)
+
+         # text
+
+         if isinstance(text, list):
+             if exists(self.vocab_char_map):
+                 text = list_str_to_idx(text, self.vocab_char_map).to(device)
+             else:
+                 text = list_str_to_tensor(text).to(device)
+             assert text.shape[0] == batch
+
+         if exists(text):
+             text_lens = (text != -1).sum(dim=-1)
+             lens = torch.maximum(text_lens, lens)  # make sure lengths are at least those of the text characters
+
+         # duration
+
+         cond_mask = lens_to_mask(lens)
+         if edit_mask is not None:
+             cond_mask = cond_mask & edit_mask
+
+         if isinstance(duration, int):
+             duration = torch.full((batch,), duration, device=device, dtype=torch.long)
+
+         duration = torch.maximum(lens + 1, duration)  # just add one token so something is generated
+         duration = duration.clamp(max=max_duration)
+         max_duration = duration.amax()
+
+         # duplicate test corner for inner time step observation
+         if duplicate_test:
+             test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)
+
+         cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
+         cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)
+         cond_mask = cond_mask.unsqueeze(-1)
+         step_cond = torch.where(
+             cond_mask, cond, torch.zeros_like(cond)
+         )  # allow direct control (cut cond audio) with lens passed in
+
+         if batch > 1:
+             mask = lens_to_mask(duration)
+         else:  # save memory and speed up, as single inference need no mask currently
+             mask = None
+
+         # test for no ref audio
+         if no_ref_audio:
+             cond = torch.zeros_like(cond)
+
+         # neural ode
+
+         def fn(t, x):
+             # at each step, conditioning is fixed
+             # step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))
+
+             # predict flow
+             pred = self.transformer(
+                 x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=False, drop_text=False
+             )
+             if cfg_strength < 1e-5:
+                 return pred
+
+             null_pred = self.transformer(
+                 x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=True, drop_text=True
+             )
+             return pred + (pred - null_pred) * cfg_strength
+
+         # noise input
+         # to make sure batch inference result is same with different batch size, and for sure single inference
+         # still some difference maybe due to convolutional layers
+         y0 = []
+         for dur in duration:
+             if exists(seed):
+                 torch.manual_seed(seed)
+             y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
+         y0 = pad_sequence(y0, padding_value=0, batch_first=True)
+
+         t_start = 0
+
+         # duplicate test corner for inner time step observation
+         if duplicate_test:
+             t_start = t_inter
+             y0 = (1 - t_start) * y0 + t_start * test_cond
+             steps = int(steps * (1 - t_start))
+
+         t = torch.linspace(t_start, 1, steps + 1, device=self.device, dtype=step_cond.dtype)
+         if sway_sampling_coef is not None:
+             t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
+
+         trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
+
+         sampled = trajectory[-1]
+         out = sampled
+         out = torch.where(cond_mask, cond, out)
+
+         if exists(vocoder):
+             out = out.permute(0, 2, 1)
+             out = vocoder(out)
+
+         return out, trajectory
+
+     def forward(
+         self,
+         inp: float["b n d"] | float["b nw"],  # mel or raw wave  # noqa: F722
+         text: int["b nt"] | list[str],  # noqa: F722
+         *,
+         lens: int["b"] | None = None,  # noqa: F821
+         noise_scheduler: str | None = None,
+     ):
+         # handle raw wave
+         if inp.ndim == 2:
+             inp = self.mel_spec(inp)
+             inp = inp.permute(0, 2, 1)
+             assert inp.shape[-1] == self.num_channels
+
+         batch, seq_len, dtype, device, _σ1 = *inp.shape[:2], inp.dtype, self.device, self.sigma
+
+         # handle text as string
+         if isinstance(text, list):
+             if exists(self.vocab_char_map):
+                 text = list_str_to_idx(text, self.vocab_char_map).to(device)
+             else:
+                 text = list_str_to_tensor(text).to(device)
+             assert text.shape[0] == batch
+
+         # lens and mask
+         if not exists(lens):
+             lens = torch.full((batch,), seq_len, device=device)
+
+         mask = lens_to_mask(lens, length=seq_len)  # useless here, as collate_fn will pad to max length in batch
+
+         # get a random span to mask out for training conditionally
+         frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
+         rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)
+
+         if exists(mask):
+             rand_span_mask &= mask
+
+         # mel is x1
+         x1 = inp
+
+         # x0 is gaussian noise
+         x0 = torch.randn_like(x1)
+
+         # time step
+         time = torch.rand((batch,), dtype=dtype, device=self.device)
+         # TODO. noise_scheduler
+
+         # sample xt (φ_t(x) in the paper)
+         t = time.unsqueeze(-1).unsqueeze(-1)
+         φ = (1 - t) * x0 + t * x1
+         flow = x1 - x0
+
+         # only predict what is within the random mask span for infilling
+         cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)
+
+         # transformer and cfg training with a drop rate
+         drop_audio_cond = random() < self.audio_drop_prob  # p_drop in voicebox paper
+         if random() < self.cond_drop_prob:  # p_uncond in voicebox paper
+             drop_audio_cond = True
+             drop_text = True
+         else:
+             drop_text = False
+
+         # if want to rigorously mask out padding, record in collate_fn in dataset.py, and pass in here
+         # adding mask will use more memory, thus also need to adjust batchsampler with scaled down threshold for long sequences
+         pred = self.transformer(
+             x=φ, cond=cond, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text
+         )
+
+         # flow matching loss
+         loss = F.mse_loss(pred, flow, reduction="none")
+         loss = loss[rand_span_mask]
+
+         return loss.mean(), cond, pred
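
The new CFM module is F5-TTS's conditional flow-matching wrapper: forward() trains the transformer to predict the flow x1 - x0 from the interpolant φ = (1 - t) * x0 + t * x1 under a masked MSE loss, and sample() integrates that learned flow from noise with torchdiffeq's odeint, applying classifier-free guidance as pred + (pred - null_pred) * cfg_strength. The sketch below shows roughly how sample() is driven; it is illustrative only — it assumes f5_tts is importable (xinference puts the vendored copy under xinference/thirdparty/), uses made-up DiT hyperparameters and random weights rather than a trained checkpoint, and skips the vocoder. The loaders added in this release (xinference/thirdparty/f5_tts/api.py, xinference/model/audio/f5tts.py) are the supported entry points.

    import torch
    from f5_tts.model import CFM, DiT  # exported by the new f5_tts/model/__init__.py

    # Illustrative hyperparameters only; a real setup restores a trained checkpoint.
    model = CFM(
        transformer=DiT(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
        mel_spec_kwargs=dict(target_sample_rate=24_000, n_mel_channels=100, hop_length=256),
    )

    ref_audio = torch.randn(1, 24_000)  # placeholder 1 s reference wave at 24 kHz
    out, trajectory = model.sample(
        cond=ref_audio,           # raw wave is converted to mel inside sample()
        text=["hello world"],     # no vocab_char_map, so characters go through list_str_to_tensor
        duration=500,             # total output length in mel frames (reference + generated)
        steps=32,                 # Euler steps for odeint
        cfg_strength=2.0,         # classifier-free guidance scale
        sway_sampling_coef=-1.0,  # bends the time grid toward early, low-t steps
    )
    # out: (1, 500, 100) mel frames; pass a vocoder= callable to map them back to a waveform.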
xinference/thirdparty/f5_tts/model/dataset.py (new file)
@@ -0,0 +1,319 @@
+ import json
+ import random
+ from importlib.resources import files
+
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ from datasets import Dataset as Dataset_
+ from datasets import load_from_disk
+ from torch import nn
+ from torch.utils.data import Dataset, Sampler
+ from tqdm import tqdm
+
+ from f5_tts.model.modules import MelSpec
+ from f5_tts.model.utils import default
+
+
+ class HFDataset(Dataset):
+     def __init__(
+         self,
+         hf_dataset: Dataset,
+         target_sample_rate=24_000,
+         n_mel_channels=100,
+         hop_length=256,
+         n_fft=1024,
+         win_length=1024,
+         mel_spec_type="vocos",
+     ):
+         self.data = hf_dataset
+         self.target_sample_rate = target_sample_rate
+         self.hop_length = hop_length
+
+         self.mel_spectrogram = MelSpec(
+             n_fft=n_fft,
+             hop_length=hop_length,
+             win_length=win_length,
+             n_mel_channels=n_mel_channels,
+             target_sample_rate=target_sample_rate,
+             mel_spec_type=mel_spec_type,
+         )
+
+     def get_frame_len(self, index):
+         row = self.data[index]
+         audio = row["audio"]["array"]
+         sample_rate = row["audio"]["sampling_rate"]
+         return audio.shape[-1] / sample_rate * self.target_sample_rate / self.hop_length
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         row = self.data[index]
+         audio = row["audio"]["array"]
+
+         # logger.info(f"Audio shape: {audio.shape}")
+
+         sample_rate = row["audio"]["sampling_rate"]
+         duration = audio.shape[-1] / sample_rate
+
+         if duration > 30 or duration < 0.3:
+             return self.__getitem__((index + 1) % len(self.data))
+
+         audio_tensor = torch.from_numpy(audio).float()
+
+         if sample_rate != self.target_sample_rate:
+             resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
+             audio_tensor = resampler(audio_tensor)
+
+         audio_tensor = audio_tensor.unsqueeze(0)  # 't -> 1 t')
+
+         mel_spec = self.mel_spectrogram(audio_tensor)
+
+         mel_spec = mel_spec.squeeze(0)  # '1 d t -> d t'
+
+         text = row["text"]
+
+         return dict(
+             mel_spec=mel_spec,
+             text=text,
+         )
+
+
+ class CustomDataset(Dataset):
+     def __init__(
+         self,
+         custom_dataset: Dataset,
+         durations=None,
+         target_sample_rate=24_000,
+         hop_length=256,
+         n_mel_channels=100,
+         n_fft=1024,
+         win_length=1024,
+         mel_spec_type="vocos",
+         preprocessed_mel=False,
+         mel_spec_module: nn.Module | None = None,
+     ):
+         self.data = custom_dataset
+         self.durations = durations
+         self.target_sample_rate = target_sample_rate
+         self.hop_length = hop_length
+         self.n_fft = n_fft
+         self.win_length = win_length
+         self.mel_spec_type = mel_spec_type
+         self.preprocessed_mel = preprocessed_mel
+
+         if not preprocessed_mel:
+             self.mel_spectrogram = default(
+                 mel_spec_module,
+                 MelSpec(
+                     n_fft=n_fft,
+                     hop_length=hop_length,
+                     win_length=win_length,
+                     n_mel_channels=n_mel_channels,
+                     target_sample_rate=target_sample_rate,
+                     mel_spec_type=mel_spec_type,
+                 ),
+             )
+
+     def get_frame_len(self, index):
+         if (
+             self.durations is not None
+         ):  # Please make sure the separately provided durations are correct, otherwise 99.99% OOM
+             return self.durations[index] * self.target_sample_rate / self.hop_length
+         return self.data[index]["duration"] * self.target_sample_rate / self.hop_length
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         while True:
+             row = self.data[index]
+             audio_path = row["audio_path"]
+             text = row["text"]
+             duration = row["duration"]
+
+             # filter by given length
+             if 0.3 <= duration <= 30:
+                 break  # valid
+
+             index = (index + 1) % len(self.data)
+
+         if self.preprocessed_mel:
+             mel_spec = torch.tensor(row["mel_spec"])
+         else:
+             audio, source_sample_rate = torchaudio.load(audio_path)
+
+             # make sure mono input
+             if audio.shape[0] > 1:
+                 audio = torch.mean(audio, dim=0, keepdim=True)
+
+             # resample if necessary
+             if source_sample_rate != self.target_sample_rate:
+                 resampler = torchaudio.transforms.Resample(source_sample_rate, self.target_sample_rate)
+                 audio = resampler(audio)
+
+             # to mel spectrogram
+             mel_spec = self.mel_spectrogram(audio)
+             mel_spec = mel_spec.squeeze(0)  # '1 d t -> d t'
+
+         return {
+             "mel_spec": mel_spec,
+             "text": text,
+         }
+
+
+ # Dynamic Batch Sampler
+ class DynamicBatchSampler(Sampler[list[int]]):
+     """Extension of Sampler that will do the following:
+     1. Change the batch size (essentially number of sequences)
+        in a batch to ensure that the total number of frames are less
+        than a certain threshold.
+     2. Make sure the padding efficiency in the batch is high.
+     """
+
+     def __init__(
+         self, sampler: Sampler[int], frames_threshold: int, max_samples=0, random_seed=None, drop_last: bool = False
+     ):
+         self.sampler = sampler
+         self.frames_threshold = frames_threshold
+         self.max_samples = max_samples
+
+         indices, batches = [], []
+         data_source = self.sampler.data_source
+
+         for idx in tqdm(
+             self.sampler, desc="Sorting with sampler... if slow, check whether dataset is provided with duration"
+         ):
+             indices.append((idx, data_source.get_frame_len(idx)))
+         indices.sort(key=lambda elem: elem[1])
+
+         batch = []
+         batch_frames = 0
+         for idx, frame_len in tqdm(
+             indices, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"
+         ):
+             if batch_frames + frame_len <= self.frames_threshold and (max_samples == 0 or len(batch) < max_samples):
+                 batch.append(idx)
+                 batch_frames += frame_len
+             else:
+                 if len(batch) > 0:
+                     batches.append(batch)
+                 if frame_len <= self.frames_threshold:
+                     batch = [idx]
+                     batch_frames = frame_len
+                 else:
+                     batch = []
+                     batch_frames = 0
+
+         if not drop_last and len(batch) > 0:
+             batches.append(batch)
+
+         del indices
+
+         # if want to have different batches between epochs, may just set a seed and log it in ckpt
+         # cuz during multi-gpu training, although the batch on per gpu not change between epochs, the formed general minibatch is different
+         # e.g. for epoch n, use (random_seed + n)
+         random.seed(random_seed)
+         random.shuffle(batches)
+
+         self.batches = batches
+
+     def __iter__(self):
+         return iter(self.batches)
+
+     def __len__(self):
+         return len(self.batches)
+
+
+ # Load dataset
+
+
+ def load_dataset(
+     dataset_name: str,
+     tokenizer: str = "pinyin",
+     dataset_type: str = "CustomDataset",
+     audio_type: str = "raw",
+     mel_spec_module: nn.Module | None = None,
+     mel_spec_kwargs: dict = dict(),
+ ) -> CustomDataset | HFDataset:
+     """
+     dataset_type - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
+                  - "CustomDatasetPath" if you just want to pass the full path to a preprocessed dataset without relying on tokenizer
+     """
+
+     print("Loading dataset ...")
+
+     if dataset_type == "CustomDataset":
+         rel_data_path = str(files("f5_tts").joinpath(f"../../data/{dataset_name}_{tokenizer}"))
+         if audio_type == "raw":
+             try:
+                 train_dataset = load_from_disk(f"{rel_data_path}/raw")
+             except:  # noqa: E722
+                 train_dataset = Dataset_.from_file(f"{rel_data_path}/raw.arrow")
+             preprocessed_mel = False
+         elif audio_type == "mel":
+             train_dataset = Dataset_.from_file(f"{rel_data_path}/mel.arrow")
+             preprocessed_mel = True
+         with open(f"{rel_data_path}/duration.json", "r", encoding="utf-8") as f:
+             data_dict = json.load(f)
+         durations = data_dict["duration"]
+         train_dataset = CustomDataset(
+             train_dataset,
+             durations=durations,
+             preprocessed_mel=preprocessed_mel,
+             mel_spec_module=mel_spec_module,
+             **mel_spec_kwargs,
+         )
+
+     elif dataset_type == "CustomDatasetPath":
+         try:
+             train_dataset = load_from_disk(f"{dataset_name}/raw")
+         except:  # noqa: E722
+             train_dataset = Dataset_.from_file(f"{dataset_name}/raw.arrow")
+
+         with open(f"{dataset_name}/duration.json", "r", encoding="utf-8") as f:
+             data_dict = json.load(f)
+         durations = data_dict["duration"]
+         train_dataset = CustomDataset(
+             train_dataset, durations=durations, preprocessed_mel=preprocessed_mel, **mel_spec_kwargs
+         )
+
+     elif dataset_type == "HFDataset":
+         print(
+             "Should manually modify the path of huggingface dataset to your need.\n"
+             + "May also the corresponding script cuz different dataset may have different format."
+         )
+         pre, post = dataset_name.split("_")
+         train_dataset = HFDataset(
+             load_dataset(f"{pre}/{pre}", split=f"train.{post}", cache_dir=str(files("f5_tts").joinpath("../../data"))),
+         )
+
+     return train_dataset
+
+
+ # collation
+
+
+ def collate_fn(batch):
+     mel_specs = [item["mel_spec"].squeeze(0) for item in batch]
+     mel_lengths = torch.LongTensor([spec.shape[-1] for spec in mel_specs])
+     max_mel_length = mel_lengths.amax()
+
+     padded_mel_specs = []
+     for spec in mel_specs:  # TODO. maybe records mask for attention here
+         padding = (0, max_mel_length - spec.size(-1))
+         padded_spec = F.pad(spec, padding, value=0)
+         padded_mel_specs.append(padded_spec)
+
+     mel_specs = torch.stack(padded_mel_specs)
+
+     text = [item["text"] for item in batch]
+     text_lengths = torch.LongTensor([len(item) for item in text])
+
+     return dict(
+         mel=mel_specs,
+         mel_lengths=mel_lengths,
+         text=text,
+         text_lengths=text_lengths,
+     )
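
The new dataset.py supplies the training-side plumbing: datasets that filter clips to 0.3–30 s and emit mel/text pairs, a DynamicBatchSampler that sorts indices by frame length and packs them under a per-batch frame budget, and a collate_fn that zero-pads each batch's mels to its maximum length. Below is a minimal sketch of wiring these pieces into a DataLoader; the dataset name, frame budget, and worker count are placeholders, and it assumes a dataset already preprocessed into the data/ layout that load_dataset expects.

    from torch.utils.data import DataLoader, SequentialSampler

    train_dataset = load_dataset("Emilia_ZH_EN", tokenizer="pinyin", dataset_type="CustomDataset")

    # DynamicBatchSampler reads frame lengths via sampler.data_source.get_frame_len(),
    # so the inner sampler must wrap the dataset itself.
    sampler = SequentialSampler(train_dataset)
    batch_sampler = DynamicBatchSampler(
        sampler,
        frames_threshold=38_400,  # placeholder cap on total mel frames per batch
        max_samples=64,           # cap on sequences per batch (0 = no cap)
        random_seed=666,          # batches are shuffled once with this seed
        drop_last=False,
    )

    train_loader = DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,  # yields one index list per packed batch
        collate_fn=collate_fn,        # pads mels and returns per-item lengths
        num_workers=4,
    )

    for batch in train_loader:
        mel = batch["mel"]            # (b, n_mel, frames); permute to (b, frames, n_mel) for CFM.forward
        mel_lens = batch["mel_lengths"]
        break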