xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (343)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +77 -71
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +79 -19
  6. xinference/core/supervisor.py +172 -10
  7. xinference/core/utils.py +12 -8
  8. xinference/core/worker.py +102 -4
  9. xinference/deploy/cmdline.py +3 -1
  10. xinference/deploy/test/test_cmdline.py +56 -0
  11. xinference/isolation.py +24 -0
  12. xinference/model/audio/core.py +16 -0
  13. xinference/model/audio/cosyvoice.py +39 -6
  14. xinference/model/audio/f5tts.py +200 -0
  15. xinference/model/audio/f5tts_mlx.py +260 -0
  16. xinference/model/audio/fish_speech.py +36 -111
  17. xinference/model/audio/melotts.py +110 -0
  18. xinference/model/audio/model_spec.json +99 -3
  19. xinference/model/audio/model_spec_modelscope.json +27 -0
  20. xinference/model/audio/utils.py +32 -0
  21. xinference/model/audio/whisper.py +35 -10
  22. xinference/model/embedding/core.py +203 -142
  23. xinference/model/embedding/model_spec.json +7 -0
  24. xinference/model/embedding/model_spec_modelscope.json +8 -0
  25. xinference/model/image/core.py +69 -1
  26. xinference/model/image/model_spec.json +145 -4
  27. xinference/model/image/model_spec_modelscope.json +150 -4
  28. xinference/model/image/stable_diffusion/core.py +45 -13
  29. xinference/model/llm/__init__.py +4 -2
  30. xinference/model/llm/llm_family.json +536 -53
  31. xinference/model/llm/llm_family.py +15 -36
  32. xinference/model/llm/llm_family_modelscope.json +454 -20
  33. xinference/model/llm/memory.py +1 -1
  34. xinference/model/llm/mlx/core.py +248 -52
  35. xinference/model/llm/sglang/core.py +1 -0
  36. xinference/model/llm/transformers/chatglm.py +9 -5
  37. xinference/model/llm/transformers/cogagent.py +272 -0
  38. xinference/model/llm/transformers/core.py +2 -0
  39. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  40. xinference/model/llm/transformers/utils.py +16 -8
  41. xinference/model/llm/utils.py +36 -4
  42. xinference/model/llm/vllm/core.py +53 -10
  43. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  44. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  45. xinference/model/llm/vllm/xavier/block.py +111 -0
  46. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  47. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  48. xinference/model/llm/vllm/xavier/collective.py +74 -0
  49. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  50. xinference/model/llm/vllm/xavier/engine.py +247 -0
  51. xinference/model/llm/vllm/xavier/executor.py +134 -0
  52. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  53. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  54. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  55. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  56. xinference/model/video/diffusers.py +14 -0
  57. xinference/model/video/model_spec.json +15 -0
  58. xinference/model/video/model_spec_modelscope.json +16 -0
  59. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  60. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  61. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  62. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  63. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  64. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  65. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  66. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  67. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  68. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  69. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  70. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  71. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  72. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  73. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  74. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  75. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  76. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  77. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  78. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  79. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  80. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  81. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  82. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  83. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  84. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  85. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  86. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  87. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  88. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  89. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  90. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  91. xinference/thirdparty/f5_tts/api.py +166 -0
  92. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  93. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  94. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  95. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  96. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  97. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  98. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  99. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  100. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  101. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  102. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  103. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  104. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  105. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  106. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  107. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  108. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  109. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  110. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  111. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  112. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  114. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  115. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  116. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  117. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  118. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  119. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  120. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  121. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  122. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  123. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  124. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  125. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  126. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  127. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  128. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  129. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  130. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  131. xinference/thirdparty/f5_tts/train/README.md +77 -0
  132. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  133. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  134. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  135. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  136. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  137. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  138. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  139. xinference/thirdparty/f5_tts/train/train.py +75 -0
  140. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  141. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  142. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  143. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  144. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  145. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  146. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  147. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  148. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  149. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  150. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  151. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  152. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  153. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  154. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  155. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  156. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  157. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  158. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  159. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  160. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  161. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  162. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  163. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  164. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  165. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  166. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  167. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  168. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  169. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  170. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  171. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  172. xinference/thirdparty/matcha/utils/utils.py +2 -2
  173. xinference/thirdparty/melo/api.py +135 -0
  174. xinference/thirdparty/melo/app.py +61 -0
  175. xinference/thirdparty/melo/attentions.py +459 -0
  176. xinference/thirdparty/melo/commons.py +160 -0
  177. xinference/thirdparty/melo/configs/config.json +94 -0
  178. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  179. xinference/thirdparty/melo/data_utils.py +413 -0
  180. xinference/thirdparty/melo/download_utils.py +67 -0
  181. xinference/thirdparty/melo/infer.py +25 -0
  182. xinference/thirdparty/melo/init_downloads.py +14 -0
  183. xinference/thirdparty/melo/losses.py +58 -0
  184. xinference/thirdparty/melo/main.py +36 -0
  185. xinference/thirdparty/melo/mel_processing.py +174 -0
  186. xinference/thirdparty/melo/models.py +1030 -0
  187. xinference/thirdparty/melo/modules.py +598 -0
  188. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  189. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  190. xinference/thirdparty/melo/preprocess_text.py +135 -0
  191. xinference/thirdparty/melo/split_utils.py +174 -0
  192. xinference/thirdparty/melo/text/__init__.py +35 -0
  193. xinference/thirdparty/melo/text/chinese.py +199 -0
  194. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  195. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  196. xinference/thirdparty/melo/text/cleaner.py +36 -0
  197. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  198. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  199. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  200. xinference/thirdparty/melo/text/english.py +284 -0
  201. xinference/thirdparty/melo/text/english_bert.py +39 -0
  202. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  203. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  204. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  205. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  206. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  207. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  208. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  209. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  210. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  211. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  212. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  213. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  214. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  215. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  216. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  217. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  218. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  219. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  220. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  221. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  222. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  223. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  224. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  225. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  226. xinference/thirdparty/melo/text/french.py +94 -0
  227. xinference/thirdparty/melo/text/french_bert.py +39 -0
  228. xinference/thirdparty/melo/text/japanese.py +647 -0
  229. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  230. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  231. xinference/thirdparty/melo/text/korean.py +192 -0
  232. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  233. xinference/thirdparty/melo/text/spanish.py +122 -0
  234. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  235. xinference/thirdparty/melo/text/symbols.py +290 -0
  236. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  237. xinference/thirdparty/melo/train.py +635 -0
  238. xinference/thirdparty/melo/train.sh +19 -0
  239. xinference/thirdparty/melo/transforms.py +209 -0
  240. xinference/thirdparty/melo/utils.py +424 -0
  241. xinference/types.py +15 -0
  242. xinference/web/ui/build/asset-manifest.json +6 -6
  243. xinference/web/ui/build/index.html +1 -1
  244. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  245. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  246. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  247. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  248. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  249. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  250. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  251. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  252. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  253. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  254. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  255. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  256. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  257. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  258. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  259. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  260. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  261. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  262. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  263. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  264. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  265. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  266. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  292. xinference/web/ui/node_modules/.package-lock.json +67 -3
  293. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  294. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  295. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  296. xinference/web/ui/node_modules/i18next/package.json +129 -0
  297. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  298. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  299. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  300. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  301. xinference/web/ui/package-lock.json +69 -3
  302. xinference/web/ui/package.json +2 -0
  303. xinference/web/ui/src/locales/en.json +186 -0
  304. xinference/web/ui/src/locales/zh.json +186 -0
  305. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/METADATA +68 -32
  306. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/RECORD +316 -122
  307. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  308. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  309. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  310. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  311. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  312. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  313. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  314. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  315. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  316. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  317. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  318. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  319. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  320. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  321. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  322. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  323. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  324. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  325. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  326. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  327. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  328. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  329. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  330. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  331. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  332. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  333. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  334. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  335. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  336. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  337. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  338. /xinference/thirdparty/{fish_speech/tools → melo/text/fr_phonemizer}/__init__.py +0 -0
  339. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  340. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  341. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/WHEEL +0 -0
  342. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  343. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/model/audio/cosyvoice.py
@@ -39,6 +39,7 @@ class CosyVoiceModel:
         self._device = device
         self._model = None
         self._kwargs = kwargs
+        self._is_cosyvoice2 = False

     @property
     def model_ability(self):
@@ -48,14 +49,32 @@ class CosyVoiceModel:
         import os
         import sys

+        import torch
+
         # The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
-        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
+        thirdparty_dir = os.path.join(os.path.dirname(__file__), "../../thirdparty")
+        sys.path.insert(0, thirdparty_dir)
+
+        if "CosyVoice2" in self._model_spec.model_name:
+            from cosyvoice.cli.cosyvoice import CosyVoice2 as CosyVoice
+
+            self._is_cosyvoice2 = True
+        else:
+            from cosyvoice.cli.cosyvoice import CosyVoice

-        from cosyvoice.cli.cosyvoice import CosyVoice
+            self._is_cosyvoice2 = False

-        self._model = CosyVoice(
-            self._model_path, load_jit=self._kwargs.get("load_jit", False)
+        # Unify this configuration name as 'compile' to be compatible with the name 'load_jit'.
+        load_jit = self._kwargs.get("load_jit", False) or self._kwargs.get(
+            "compile", False
         )
+        logger.info("Loading CosyVoice model, compile=%s...", load_jit)
+        self._model = CosyVoice(self._model_path, load_jit=load_jit)
+        if self._is_cosyvoice2:
+            spk2info_file = os.path.join(thirdparty_dir, "cosyvoice/bin/spk2info.pt")
+            self._model.frontend.spk2info = torch.load(
+                spk2info_file, map_location=self._device
+            )

     def _speech_handle(
         self,
@@ -78,6 +97,15 @@ class CosyVoiceModel:
             output = self._model.inference_zero_shot(
                 input, prompt_text, prompt_speech_16k, stream=stream
             )
+        elif instruct_text:
+            assert self._is_cosyvoice2
+            logger.info("CosyVoice inference_instruct")
+            output = self._model.inference_instruct2(
+                input,
+                instruct_text=instruct_text,
+                prompt_speech_16k=prompt_speech_16k,
+                stream=stream,
+            )
         else:
             logger.info("CosyVoice inference_cross_lingual")
             output = self._model.inference_cross_lingual(
@@ -87,6 +115,7 @@ class CosyVoiceModel:
         available_speakers = self._model.list_avaliable_spks()
         if not voice:
             voice = available_speakers[0]
+            logger.info("Auto select speaker: %s", voice)
         else:
             assert (
                 voice in available_speakers
@@ -106,7 +135,9 @@ class CosyVoiceModel:
         def _generator_stream():
             with BytesIO() as out:
                 writer = torchaudio.io.StreamWriter(out, format=response_format)
-                writer.add_audio_stream(sample_rate=22050, num_channels=1)
+                writer.add_audio_stream(
+                    sample_rate=self._model.sample_rate, num_channels=1
+                )
                 i = 0
                 last_pos = 0
                 with writer.open():
@@ -125,7 +156,7 @@ class CosyVoiceModel:
             chunks = [o["tts_speech"] for o in output]
             t = torch.cat(chunks, dim=1)
             with BytesIO() as out:
-                torchaudio.save(out, t, 22050, format=response_format)
+                torchaudio.save(out, t, self._model.sample_rate, format=response_format)
                 return out.getvalue()

         return _generator_stream() if stream else _generator_block()
@@ -163,6 +194,8 @@ class CosyVoiceModel:
             assert (
                 prompt_text is None
             ), "CosyVoice Instruct model does not support prompt_text"
+        elif self._is_cosyvoice2:
+            pass
         else:
            # inference_zero_shot
            # inference_cross_lingual
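
The hunks above add CosyVoice 2 support (a CosyVoice2 import branch, an instruct_text path through inference_instruct2, and a bundled spk2info.pt speaker table) and unify the `load_jit`/`compile` kwarg. A minimal, hypothetical launch sketch through the xinference client follows; the endpoint URL and the registered model name "CosyVoice2-0.5B" are assumptions, not taken from this diff.

```python
# Hypothetical sketch: launch a CosyVoice 2 audio model and forward the unified
# `compile` kwarg introduced above. Endpoint and model name are assumptions.
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(
    model_name="CosyVoice2-0.5B",  # assumed registered name for a CosyVoice 2 build
    model_type="audio",
    compile=True,                  # forwarded to CosyVoiceModel; treated like load_jit
)
model = client.get_model(model_uid)
audio_bytes = model.speech("你好，欢迎使用 Xinference。", response_format="wav")
with open("out.wav", "wb") as f:
    f.write(audio_bytes)
```
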
xinference/model/audio/f5tts.py (new file)
@@ -0,0 +1,200 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import logging
+import os
+import re
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional, Union
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class F5TTSModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._vocoder = None
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        import os
+        import sys
+
+        # The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
+        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
+
+        from f5_tts.infer.utils_infer import load_model, load_vocoder
+        from f5_tts.model import DiT
+
+        vocoder_name = self._kwargs.get("vocoder_name", "vocos")
+        vocoder_path = self._kwargs.get("vocoder_path")
+
+        if vocoder_name not in ["vocos", "bigvgan"]:
+            raise Exception(f"Unsupported vocoder name: {vocoder_name}")
+
+        if vocoder_path is not None:
+            self._vocoder = load_vocoder(
+                vocoder_name=vocoder_name, is_local=True, local_path=vocoder_path
+            )
+        else:
+            self._vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=False)
+
+        model_cls = DiT
+        model_cfg = dict(
+            dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
+        )
+        if vocoder_name == "vocos":
+            exp_name = "F5TTS_Base"
+            ckpt_step = 1200000
+        elif vocoder_name == "bigvgan":
+            exp_name = "F5TTS_Base_bigvgan"
+            ckpt_step = 1250000
+        else:
+            assert False
+        ckpt_file = os.path.join(
+            self._model_path, exp_name, f"model_{ckpt_step}.safetensors"
+        )
+        logger.info(f"Loading %s...", ckpt_file)
+        self._model = load_model(
+            model_cls, model_cfg, ckpt_file, mel_spec_type=vocoder_name
+        )
+
+    def _infer(self, ref_audio, ref_text, text_gen, model_obj, mel_spec_type, speed):
+        import numpy as np
+        from f5_tts.infer.utils_infer import infer_process, preprocess_ref_audio_text
+
+        config = {}
+        main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
+        if "voices" not in config:
+            voices = {"main": main_voice}
+        else:
+            voices = config["voices"]
+            voices["main"] = main_voice
+        for voice in voices:
+            (
+                voices[voice]["ref_audio"],
+                voices[voice]["ref_text"],
+            ) = preprocess_ref_audio_text(
+                voices[voice]["ref_audio"], voices[voice]["ref_text"]
+            )
+            logger.info("Voice:", voice)
+            logger.info("Ref_audio:", voices[voice]["ref_audio"])
+            logger.info("Ref_text:", voices[voice]["ref_text"])
+
+        final_sample_rate = None
+        generated_audio_segments = []
+        reg1 = r"(?=\[\w+\])"
+        chunks = re.split(reg1, text_gen)
+        reg2 = r"\[(\w+)\]"
+        for text in chunks:
+            if not text.strip():
+                continue
+            match = re.match(reg2, text)
+            if match:
+                voice = match[1]
+            else:
+                logger.info("No voice tag found, using main.")
+                voice = "main"
+            if voice not in voices:
+                logger.info(f"Voice {voice} not found, using main.")
+                voice = "main"
+            text = re.sub(reg2, "", text)
+            gen_text = text.strip()
+            ref_audio = voices[voice]["ref_audio"]
+            ref_text = voices[voice]["ref_text"]
+            logger.info(f"Voice: {voice}")
+            audio, final_sample_rate, spectragram = infer_process(
+                ref_audio,
+                ref_text,
+                gen_text,
+                model_obj,
+                self._vocoder,
+                mel_spec_type=mel_spec_type,
+                speed=speed,
+            )
+            generated_audio_segments.append(audio)
+
+        if generated_audio_segments:
+            final_wave = np.concatenate(generated_audio_segments)
+            return final_sample_rate, final_wave
+        return None, None
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import f5_tts
+        import soundfile
+        import tomli
+
+        if stream:
+            raise Exception("F5-TTS does not support stream generation.")
+
+        prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
+        prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
+
+        ref_audio: Union[str, io.BytesIO]
+        if prompt_speech is None:
+            base = os.path.dirname(f5_tts.__file__)
+            config = os.path.join(base, "infer/examples/basic/basic.toml")
+            with open(config, "rb") as f:
+                config_dict = tomli.load(f)
+            ref_audio = os.path.join(base, config_dict["ref_audio"])
+            prompt_text = config_dict["ref_text"]
+        else:
+            ref_audio = io.BytesIO(prompt_speech)
+            if prompt_text is None:
+                raise ValueError("`prompt_text` cannot be empty")
+
+        assert self._model is not None
+        vocoder_name = self._kwargs.get("vocoder_name", "vocos")
+        sample_rate, wav = self._infer(
+            ref_audio=ref_audio,
+            ref_text=prompt_text,
+            text_gen=input,
+            model_obj=self._model,
+            mel_spec_type=vocoder_name,
+            speed=speed,
+        )
+
+        # Save the generated audio
+        with BytesIO() as out:
+            with soundfile.SoundFile(
+                out, "w", sample_rate, 1, format=response_format.upper()
+            ) as f:
+                f.write(wav)
+            return out.getvalue()
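
f5tts.py adds a new F5TTSModel backend: load() selects a vocos or bigvgan vocoder and the matching checkpoint under <model_path>/F5TTS_Base*/model_*.safetensors, and speech() accepts prompt_speech/prompt_text kwargs for zero-shot voice cloning, falling back to the bundled basic.toml reference otherwise. A hypothetical direct-use sketch follows; the paths, the reference clip, and passing model_spec=None are illustration-only assumptions.

```python
# Hypothetical sketch of the F5TTSModel API added in this release.
# Assumes a local checkpoint dir laid out as <model_path>/F5TTS_Base/model_1200000.safetensors
# and a reference clip ref.wav with its transcript.
from xinference.model.audio.f5tts import F5TTSModel

model = F5TTSModel(
    model_uid="f5-tts-demo",
    model_path="/path/to/F5-TTS",  # assumed local checkpoint directory
    model_spec=None,               # normally supplied by xinference's audio model registry
    vocoder_name="vocos",
)
model.load()

with open("ref.wav", "rb") as f:
    ref_bytes = f.read()

wav_bytes = model.speech(
    "Hello from F5-TTS.",
    voice="",
    response_format="wav",
    prompt_speech=ref_bytes,                          # zero-shot voice cloning input
    prompt_text="Transcript of the reference clip.",  # required when prompt_speech is given
)
with open("f5tts_out.wav", "wb") as f:
    f.write(wav_bytes)
```
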
xinference/model/audio/f5tts_mlx.py (new file)
@@ -0,0 +1,260 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import io
+import logging
+import os
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal, Optional, Union
+
+import numpy as np
+from tqdm import tqdm
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class F5TTSMLXModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+        self._model = None
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        try:
+            import mlx.core as mx
+            from f5_tts_mlx.cfm import F5TTS
+            from f5_tts_mlx.dit import DiT
+            from f5_tts_mlx.duration import DurationPredictor, DurationTransformer
+            from vocos_mlx import Vocos
+        except ImportError:
+            error_message = "Failed to import module 'f5_tts_mlx'"
+            installation_guide = [
+                "Please make sure 'f5_tts_mlx' is installed.\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        path = Path(self._model_path)
+        # vocab
+
+        vocab_path = path / "vocab.txt"
+        vocab = {v: i for i, v in enumerate(Path(vocab_path).read_text().split("\n"))}
+        if len(vocab) == 0:
+            raise ValueError(f"Could not load vocab from {vocab_path}")
+
+        # duration predictor
+
+        duration_model_path = path / "duration_v2.safetensors"
+        duration_predictor = None
+
+        if duration_model_path.exists():
+            duration_predictor = DurationPredictor(
+                transformer=DurationTransformer(
+                    dim=512,
+                    depth=8,
+                    heads=8,
+                    text_dim=512,
+                    ff_mult=2,
+                    conv_layers=2,
+                    text_num_embeds=len(vocab) - 1,
+                ),
+                vocab_char_map=vocab,
+            )
+            weights = mx.load(duration_model_path.as_posix(), format="safetensors")
+            duration_predictor.load_weights(list(weights.items()))
+
+        # vocoder
+
+        vocos = Vocos.from_pretrained("lucasnewman/vocos-mel-24khz")
+
+        # model
+
+        model_path = path / "model.safetensors"
+
+        f5tts = F5TTS(
+            transformer=DiT(
+                dim=1024,
+                depth=22,
+                heads=16,
+                ff_mult=2,
+                text_dim=512,
+                conv_layers=4,
+                text_num_embeds=len(vocab) - 1,
+            ),
+            vocab_char_map=vocab,
+            vocoder=vocos.decode,
+            duration_predictor=duration_predictor,
+        )
+
+        weights = mx.load(model_path.as_posix(), format="safetensors")
+        f5tts.load_weights(list(weights.items()))
+        mx.eval(f5tts.parameters())
+
+        self._model = f5tts
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import mlx.core as mx
+        import soundfile as sf
+        import tomli
+        from f5_tts_mlx.generate import (
+            FRAMES_PER_SEC,
+            SAMPLE_RATE,
+            TARGET_RMS,
+            convert_char_to_pinyin,
+            split_sentences,
+        )
+
+        from .utils import ensure_sample_rate
+
+        if stream:
+            raise Exception("F5-TTS does not support stream generation.")
+
+        prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
+        prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
+        duration: Optional[float] = kwargs.pop("duration", None)
+        steps: Optional[int] = kwargs.pop("steps", 8)
+        cfg_strength: Optional[float] = kwargs.pop("cfg_strength", 2.0)
+        method: Literal["euler", "midpoint"] = kwargs.pop("method", "rk4")
+        sway_sampling_coef: float = kwargs.pop("sway_sampling_coef", -1.0)
+        seed: Optional[int] = kwargs.pop("seed", None)
+
+        prompt_speech_path: Union[str, io.BytesIO]
+        if prompt_speech is None:
+            base = os.path.join(os.path.dirname(__file__), "../../thirdparty/f5_tts")
+            config = os.path.join(base, "infer/examples/basic/basic.toml")
+            with open(config, "rb") as f:
+                config_dict = tomli.load(f)
+            prompt_speech_path = os.path.join(base, config_dict["ref_audio"])
+            prompt_text = config_dict["ref_text"]
+        else:
+            prompt_speech_path = io.BytesIO(prompt_speech)
+
+        if prompt_text is None:
+            raise ValueError("`prompt_text` cannot be empty")
+
+        audio, sr = sf.read(prompt_speech_path)
+        audio = ensure_sample_rate(audio, sr, SAMPLE_RATE)
+
+        audio = mx.array(audio)
+        ref_audio_duration = audio.shape[0] / SAMPLE_RATE
+        logger.debug(
+            f"Got reference audio with duration: {ref_audio_duration:.2f} seconds"
+        )
+
+        rms = mx.sqrt(mx.mean(mx.square(audio)))
+        if rms < TARGET_RMS:
+            audio = audio * TARGET_RMS / rms
+
+        sentences = split_sentences(input)
+        is_single_generation = len(sentences) <= 1 or duration is not None
+
+        if is_single_generation:
+            generation_text = convert_char_to_pinyin([prompt_text + " " + input])  # type: ignore
+
+            if duration is not None:
+                duration = int(duration * FRAMES_PER_SEC)
+
+            start_date = datetime.datetime.now()
+
+            wave, _ = self._model.sample(  # type: ignore
+                mx.expand_dims(audio, axis=0),
+                text=generation_text,
+                duration=duration,
+                steps=steps,
+                method=method,
+                speed=speed,
+                cfg_strength=cfg_strength,
+                sway_sampling_coef=sway_sampling_coef,
+                seed=seed,
+            )
+
+            wave = wave[audio.shape[0] :]
+            mx.eval(wave)
+
+            generated_duration = wave.shape[0] / SAMPLE_RATE
+            print(
+                f"Generated {generated_duration:.2f}s of audio in {datetime.datetime.now() - start_date}."
+            )
+
+        else:
+            start_date = datetime.datetime.now()
+
+            output = []
+
+            for sentence_text in tqdm(split_sentences(input)):
+                text = convert_char_to_pinyin([prompt_text + " " + sentence_text])  # type: ignore
+
+                if duration is not None:
+                    duration = int(duration * FRAMES_PER_SEC)
+
+                wave, _ = self._model.sample(  # type: ignore
+                    mx.expand_dims(audio, axis=0),
+                    text=text,
+                    duration=duration,
+                    steps=steps,
+                    method=method,
+                    speed=speed,
+                    cfg_strength=cfg_strength,
+                    sway_sampling_coef=sway_sampling_coef,
+                    seed=seed,
+                )
+
+                # trim the reference audio
+                wave = wave[audio.shape[0] :]
+                mx.eval(wave)
+
+                output.append(wave)
+
+            wave = mx.concatenate(output, axis=0)
+
+            generated_duration = wave.shape[0] / SAMPLE_RATE
+            logger.debug(
+                f"Generated {generated_duration:.2f}s of audio in {datetime.datetime.now() - start_date}."
+            )
+
+        # Save the generated audio
+        with BytesIO() as out:
+            with sf.SoundFile(
+                out, "w", SAMPLE_RATE, 1, format=response_format.upper()
+            ) as f:
+                f.write(np.array(wave))
+            return out.getvalue()
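
f5tts_mlx.py adds an MLX backend for Apple Silicon built on the f5-tts-mlx and vocos-mlx packages; its speech() additionally pops sampling kwargs (duration, steps, cfg_strength, method, sway_sampling_coef, seed). A hypothetical sketch along the same lines follows; the checkpoint path and model_spec=None are illustration-only assumptions.

```python
# Hypothetical sketch of the F5TTSMLXModel API added above. Requires the
# f5-tts-mlx and vocos-mlx packages on Apple Silicon; the model directory is
# assumed to contain model.safetensors and vocab.txt (duration_v2.safetensors optional).
from xinference.model.audio.f5tts_mlx import F5TTSMLXModel

model = F5TTSMLXModel(
    model_uid="f5-tts-mlx-demo",
    model_path="/path/to/F5-TTS-MLX",  # assumed local checkpoint directory
    model_spec=None,                   # normally supplied by xinference's audio model registry
)
model.load()

audio = model.speech(
    "Running F5-TTS on Apple Silicon via MLX.",
    voice="",
    response_format="wav",
    steps=8,                  # ODE solver steps; default taken from the diff
    method="rk4",             # default sampling method in the diff
    cfg_strength=2.0,
    sway_sampling_coef=-1.0,
    seed=42,
)
with open("f5tts_mlx_out.wav", "wb") as f:
    f.write(audio)
```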