xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,200 @@
1
+ # Copyright 2022-2023 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import io
15
+ import logging
16
+ import os
17
+ import re
18
+ from io import BytesIO
19
+ from typing import TYPE_CHECKING, Optional, Union
20
+
21
+ if TYPE_CHECKING:
22
+ from .core import AudioModelFamilyV1
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class F5TTSModel:
    """Text-to-speech wrapper around the vendored ``f5_tts`` package.

    ``load`` builds the vocoder and the DiT model from ``model_path``;
    ``speech`` synthesises audio for a text, cloning the voice of a reference
    recording (either supplied by the caller or the example bundled with
    ``f5_tts``).
    """

    # Supported vocoder name -> (checkpoint sub-directory, checkpoint step).
    _CHECKPOINTS = {
        "vocos": ("F5TTS_Base", 1200000),
        "bigvgan": ("F5TTS_Base_bigvgan", 1250000),
    }

    def __init__(
        self,
        model_uid: str,
        model_path: str,
        model_spec: "AudioModelFamilyV1",
        device: Optional[str] = None,
        **kwargs,
    ):
        """Store the configuration; nothing is loaded until ``load`` is called.

        Extra keyword arguments are kept and consulted later; ``vocoder_name``
        and ``vocoder_path`` are the ones read by this class.
        """
        self._model_uid = model_uid
        self._model_path = model_path
        self._model_spec = model_spec
        self._device = device
        self._model = None
        self._vocoder = None
        self._kwargs = kwargs

    @property
    def model_ability(self):
        """Abilities declared by the model spec."""
        return self._model_spec.model_ability

    def load(self):
        """Load the vocoder and the F5-TTS checkpoint.

        Raises:
            Exception: if ``vocoder_name`` is neither ``vocos`` nor ``bigvgan``.
        """
        import os
        import sys

        vocoder_name = self._kwargs.get("vocoder_name", "vocos")
        vocoder_path = self._kwargs.get("vocoder_path")

        # Validate before touching sys.path or importing heavy dependencies.
        if vocoder_name not in self._CHECKPOINTS:
            raise Exception(f"Unsupported vocoder name: {vocoder_name}")

        # Make the vendored ``f5_tts`` package importable by its top-level
        # name; avoid growing sys.path when load() is called more than once.
        thirdparty_dir = os.path.join(os.path.dirname(__file__), "../../thirdparty")
        if thirdparty_dir not in sys.path:
            sys.path.insert(0, thirdparty_dir)

        from f5_tts.infer.utils_infer import load_model, load_vocoder
        from f5_tts.model import DiT

        if vocoder_path is not None:
            self._vocoder = load_vocoder(
                vocoder_name=vocoder_name, is_local=True, local_path=vocoder_path
            )
        else:
            self._vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=False)

        model_cls = DiT
        model_cfg = dict(
            dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
        )
        exp_name, ckpt_step = self._CHECKPOINTS[vocoder_name]
        ckpt_file = os.path.join(
            self._model_path, exp_name, f"model_{ckpt_step}.safetensors"
        )
        logger.info("Loading %s...", ckpt_file)
        self._model = load_model(
            model_cls, model_cfg, ckpt_file, mel_spec_type=vocoder_name
        )

    def _infer(self, ref_audio, ref_text, text_gen, model_obj, mel_spec_type, speed):
        """Synthesise ``text_gen`` chunk by chunk using the reference voice.

        The text is split in front of every ``[tag]`` marker. Only the
        ``main`` voice is registered, so any tag falls back to it; the tag
        itself is removed from the text before synthesis.

        Returns:
            ``(sample_rate, waveform)`` with all chunks concatenated, or
            ``(None, None)`` when ``text_gen`` contains no non-blank chunk.
        """
        import numpy as np
        from f5_tts.infer.utils_infer import infer_process, preprocess_ref_audio_text

        voices = {"main": {"ref_audio": ref_audio, "ref_text": ref_text}}
        for name, voice_cfg in voices.items():
            voice_cfg["ref_audio"], voice_cfg["ref_text"] = preprocess_ref_audio_text(
                voice_cfg["ref_audio"], voice_cfg["ref_text"]
            )
            logger.info("Voice: %s", name)
            logger.info("Ref_audio: %s", voice_cfg["ref_audio"])
            logger.info("Ref_text: %s", voice_cfg["ref_text"])

        final_sample_rate = None
        generated_audio_segments = []
        split_pattern = r"(?=\[\w+\])"
        tag_pattern = r"\[(\w+)\]"
        for text in re.split(split_pattern, text_gen):
            if not text.strip():
                continue
            match = re.match(tag_pattern, text)
            if match:
                voice = match[1]
            else:
                logger.info("No voice tag found, using main.")
                voice = "main"
            if voice not in voices:
                logger.info(f"Voice {voice} not found, using main.")
                voice = "main"
            gen_text = re.sub(tag_pattern, "", text).strip()
            logger.info(f"Voice: {voice}")
            # infer_process returns a third value that is not used here.
            audio, final_sample_rate, _ = infer_process(
                voices[voice]["ref_audio"],
                voices[voice]["ref_text"],
                gen_text,
                model_obj,
                self._vocoder,
                mel_spec_type=mel_spec_type,
                speed=speed,
            )
            generated_audio_segments.append(audio)

        if generated_audio_segments:
            final_wave = np.concatenate(generated_audio_segments)
            return final_sample_rate, final_wave
        return None, None

    def speech(
        self,
        input: str,
        voice: str,
        response_format: str = "mp3",
        speed: float = 1.0,
        stream: bool = False,
        **kwargs,
    ):
        """Generate speech for ``input`` and return the encoded audio bytes.

        Keyword arguments:
            prompt_speech: reference audio as bytes. When omitted, the example
                audio and transcript referenced by the bundled ``basic.toml``
                are used (any ``prompt_text`` passed is then replaced).
            prompt_text: transcript of ``prompt_speech``; required when
                ``prompt_speech`` is given.

        ``voice`` is accepted but not used by this implementation.

        Raises:
            Exception: if ``stream`` is true.
            ValueError: if ``prompt_speech`` is given without ``prompt_text``,
                or if no audio could be generated from ``input``.
        """
        # Fail fast, before importing the heavy dependencies.
        if stream:
            raise Exception("F5-TTS does not support stream generation.")

        import f5_tts
        import soundfile
        import tomli

        prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
        prompt_text: Optional[str] = kwargs.pop("prompt_text", None)

        ref_audio: Union[str, io.BytesIO]
        if prompt_speech is None:
            base = os.path.dirname(f5_tts.__file__)
            config = os.path.join(base, "infer/examples/basic/basic.toml")
            with open(config, "rb") as f:
                config_dict = tomli.load(f)
            ref_audio = os.path.join(base, config_dict["ref_audio"])
            prompt_text = config_dict["ref_text"]
        else:
            if prompt_text is None:
                raise ValueError("`prompt_text` cannot be empty")
            ref_audio = io.BytesIO(prompt_speech)

        assert self._model is not None
        vocoder_name = self._kwargs.get("vocoder_name", "vocos")
        sample_rate, wav = self._infer(
            ref_audio=ref_audio,
            ref_text=prompt_text,
            text_gen=input,
            model_obj=self._model,
            mel_spec_type=vocoder_name,
            speed=speed,
        )
        if wav is None:
            # _infer found no non-blank text; writing would fail obscurely.
            raise ValueError("No audio generated, `input` must contain text.")

        # Save the generated audio
        with BytesIO() as out:
            with soundfile.SoundFile(
                out, "w", sample_rate, 1, format=response_format.upper()
            ) as f:
                f.write(wav)
            return out.getvalue()
@@ -0,0 +1,260 @@
1
+ # Copyright 2022-2023 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import datetime
16
+ import io
17
+ import logging
18
+ import os
19
+ from io import BytesIO
20
+ from pathlib import Path
21
+ from typing import TYPE_CHECKING, Literal, Optional, Union
22
+
23
+ import numpy as np
24
+ from tqdm import tqdm
25
+
26
+ if TYPE_CHECKING:
27
+ from .core import AudioModelFamilyV1
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class F5TTSMLXModel:
    """Text-to-speech wrapper around the ``f5_tts_mlx`` implementation.

    ``load`` assembles the model from the files under ``model_path``;
    ``speech`` synthesises audio for a text conditioned on a reference
    recording and its transcript.
    """

    def __init__(
        self,
        model_uid: str,
        model_path: str,
        model_spec: "AudioModelFamilyV1",
        device: Optional[str] = None,
        **kwargs,
    ):
        """Store the configuration; nothing is loaded until ``load`` is called."""
        self._model_uid = model_uid
        self._model_path = model_path
        self._model_spec = model_spec
        self._device = device
        self._model = None
        self._kwargs = kwargs

    @property
    def model_ability(self):
        """Abilities declared by the model spec."""
        return self._model_spec.model_ability

    def load(self):
        """Build the F5-TTS model and load its weights.

        Reads ``vocab.txt`` and ``model.safetensors`` from ``model_path``; a
        duration predictor is attached only when ``duration_v2.safetensors``
        exists there.

        Raises:
            ImportError: if ``f5_tts_mlx`` or one of its companions is missing.
            ValueError: if the vocabulary turns out empty.
        """
        try:
            import mlx.core as mx
            from f5_tts_mlx.cfm import F5TTS
            from f5_tts_mlx.dit import DiT
            from f5_tts_mlx.duration import DurationPredictor, DurationTransformer
            from vocos_mlx import Vocos
        except ImportError as e:
            error_message = "Failed to import module 'f5_tts_mlx'"
            installation_guide = [
                "Please make sure 'f5_tts_mlx' is installed.\n",
            ]

            # Chain the cause so the actually missing module stays visible.
            raise ImportError(
                f"{error_message}\n\n{''.join(installation_guide)}"
            ) from e

        path = Path(self._model_path)

        # vocab: one token per line, mapped to its line index
        vocab_path = path / "vocab.txt"
        vocab_lines = vocab_path.read_text(encoding="utf-8").split("\n")
        vocab = {token: index for index, token in enumerate(vocab_lines)}
        if len(vocab) == 0:
            raise ValueError(f"Could not load vocab from {vocab_path}")

        # duration predictor (optional)
        duration_model_path = path / "duration_v2.safetensors"
        duration_predictor = None

        if duration_model_path.exists():
            duration_predictor = DurationPredictor(
                transformer=DurationTransformer(
                    dim=512,
                    depth=8,
                    heads=8,
                    text_dim=512,
                    ff_mult=2,
                    conv_layers=2,
                    text_num_embeds=len(vocab) - 1,
                ),
                vocab_char_map=vocab,
            )
            weights = mx.load(duration_model_path.as_posix(), format="safetensors")
            duration_predictor.load_weights(list(weights.items()))

        # vocoder
        vocos = Vocos.from_pretrained("lucasnewman/vocos-mel-24khz")

        # model
        model_path = path / "model.safetensors"

        f5tts = F5TTS(
            transformer=DiT(
                dim=1024,
                depth=22,
                heads=16,
                ff_mult=2,
                text_dim=512,
                conv_layers=4,
                text_num_embeds=len(vocab) - 1,
            ),
            vocab_char_map=vocab,
            vocoder=vocos.decode,
            duration_predictor=duration_predictor,
        )

        weights = mx.load(model_path.as_posix(), format="safetensors")
        f5tts.load_weights(list(weights.items()))
        mx.eval(f5tts.parameters())

        self._model = f5tts

    def speech(
        self,
        input: str,
        voice: str,
        response_format: str = "mp3",
        speed: float = 1.0,
        stream: bool = False,
        **kwargs,
    ):
        """Generate speech for ``input`` and return the encoded audio bytes.

        Keyword arguments:
            prompt_speech: reference audio as bytes. When omitted, the example
                audio and transcript referenced by the vendored ``basic.toml``
                are used (any ``prompt_text`` passed is then replaced).
            prompt_text: transcript of the reference audio.
            duration: target duration in seconds; when given, the whole input
                is generated in a single pass.
            steps, cfg_strength, method, sway_sampling_coef, seed: forwarded
                unchanged to the model's ``sample`` call (defaults 8, 2.0,
                ``"rk4"``, -1.0 and ``None``).

        ``voice`` is accepted but not used by this implementation.

        Raises:
            Exception: if ``stream`` is true.
            ValueError: if no ``prompt_text`` is available.
        """
        # Fail fast, before importing the heavy dependencies.
        if stream:
            raise Exception("F5-TTS does not support stream generation.")

        import mlx.core as mx
        import soundfile as sf
        import tomli
        from f5_tts_mlx.generate import (
            FRAMES_PER_SEC,
            SAMPLE_RATE,
            TARGET_RMS,
            convert_char_to_pinyin,
            split_sentences,
        )

        from .utils import ensure_sample_rate

        prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
        prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
        duration: Optional[float] = kwargs.pop("duration", None)
        steps: Optional[int] = kwargs.pop("steps", 8)
        cfg_strength: Optional[float] = kwargs.pop("cfg_strength", 2.0)
        method: Literal["euler", "midpoint", "rk4"] = kwargs.pop("method", "rk4")
        sway_sampling_coef: float = kwargs.pop("sway_sampling_coef", -1.0)
        seed: Optional[int] = kwargs.pop("seed", None)

        prompt_speech_path: Union[str, io.BytesIO]
        if prompt_speech is None:
            base = os.path.join(os.path.dirname(__file__), "../../thirdparty/f5_tts")
            config = os.path.join(base, "infer/examples/basic/basic.toml")
            with open(config, "rb") as f:
                config_dict = tomli.load(f)
            prompt_speech_path = os.path.join(base, config_dict["ref_audio"])
            prompt_text = config_dict["ref_text"]
        else:
            prompt_speech_path = io.BytesIO(prompt_speech)

        if prompt_text is None:
            raise ValueError("`prompt_text` cannot be empty")

        audio, sr = sf.read(prompt_speech_path)
        audio = ensure_sample_rate(audio, sr, SAMPLE_RATE)

        audio = mx.array(audio)
        ref_audio_duration = audio.shape[0] / SAMPLE_RATE
        logger.debug(
            f"Got reference audio with duration: {ref_audio_duration:.2f} seconds"
        )

        # Boost a quiet reference up to the target RMS level.
        rms = mx.sqrt(mx.mean(mx.square(audio)))
        if rms < TARGET_RMS:
            audio = audio * TARGET_RMS / rms

        def generate(target_text, frames):
            """Sample one utterance and strip the leading reference audio."""
            text = convert_char_to_pinyin([prompt_text + " " + target_text])  # type: ignore
            wave, _ = self._model.sample(  # type: ignore
                mx.expand_dims(audio, axis=0),
                text=text,
                duration=frames,
                steps=steps,
                method=method,
                speed=speed,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
                seed=seed,
            )
            # trim the reference audio
            wave = wave[audio.shape[0] :]
            mx.eval(wave)
            return wave

        sentences = split_sentences(input)
        is_single_generation = len(sentences) <= 1 or duration is not None

        start_date = datetime.datetime.now()
        if is_single_generation:
            if duration is not None:
                duration = int(duration * FRAMES_PER_SEC)
            wave = generate(input, duration)
        else:
            # This branch is only reached when no duration was requested, so
            # each sentence is generated without a duration constraint.
            output = [generate(sentence_text, None) for sentence_text in tqdm(sentences)]
            wave = mx.concatenate(output, axis=0)

        generated_duration = wave.shape[0] / SAMPLE_RATE
        logger.debug(
            f"Generated {generated_duration:.2f}s of audio in {datetime.datetime.now() - start_date}."
        )

        # Save the generated audio
        with BytesIO() as out:
            with sf.SoundFile(
                out, "w", SAMPLE_RATE, 1, format=response_format.upper()
            ) as f:
                f.write(np.array(wave))
            return out.getvalue()
@@ -11,10 +11,8 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- import gc
15
14
  import logging
16
15
  import os.path
17
- import queue
18
16
  import sys
19
17
  from io import BytesIO
20
18
  from typing import TYPE_CHECKING, Optional
@@ -60,6 +58,7 @@ class FishSpeechModel:
60
58
  self._device = device
61
59
  self._llama_queue = None
62
60
  self._model = None
61
+ self._engine = None
63
62
  self._kwargs = kwargs
64
63
 
65
64
  @property
@@ -72,6 +71,7 @@ class FishSpeechModel:
72
71
  0, os.path.join(os.path.dirname(__file__), "../../thirdparty/fish_speech")
73
72
  )
74
73
 
74
+ from tools.inference_engine import TTSInferenceEngine
75
75
  from tools.llama.generate import launch_thread_safe_queue
76
76
  from tools.vqgan.inference import load_model as load_decoder_model
77
77
 
@@ -81,12 +81,19 @@ class FishSpeechModel:
81
81
  if not is_device_available(self._device):
82
82
  raise ValueError(f"Device {self._device} is not available!")
83
83
 
84
- logger.info("Loading Llama model...")
84
+ # https://github.com/pytorch/pytorch/issues/129207
85
+ if self._device == "mps":
86
+ logger.warning("The Conv1d has bugs on MPS backend, fallback to CPU.")
87
+ self._device = "cpu"
88
+
89
+ enable_compile = self._kwargs.get("compile", False)
90
+ precision = self._kwargs.get("precision", torch.bfloat16)
91
+ logger.info("Loading Llama model, compile=%s...", enable_compile)
85
92
  self._llama_queue = launch_thread_safe_queue(
86
93
  checkpoint_path=self._model_path,
87
94
  device=self._device,
88
- precision=torch.bfloat16,
89
- compile=False,
95
+ precision=precision,
96
+ compile=enable_compile,
90
97
  )
91
98
  logger.info("Llama model loaded, loading VQ-GAN model...")
92
99
 
@@ -100,98 +107,10 @@ class FishSpeechModel:
100
107
  device=self._device,
101
108
  )
102
109
 
103
- @torch.inference_mode()
104
- def _inference(
105
- self,
106
- text,
107
- enable_reference_audio,
108
- reference_audio,
109
- reference_text,
110
- max_new_tokens,
111
- chunk_length,
112
- top_p,
113
- repetition_penalty,
114
- temperature,
115
- streaming=False,
116
- ):
117
- from fish_speech.utils import autocast_exclude_mps
118
- from tools.api import decode_vq_tokens, encode_reference
119
- from tools.llama.generate import (
120
- GenerateRequest,
121
- GenerateResponse,
122
- WrappedGenerateResponse,
123
- )
124
-
125
- # Parse reference audio aka prompt
126
- prompt_tokens = encode_reference(
127
- decoder_model=self._model,
128
- reference_audio=reference_audio,
129
- enable_reference_audio=enable_reference_audio,
130
- )
131
-
132
- # LLAMA Inference
133
- request = dict(
134
- device=self._model.device,
135
- max_new_tokens=max_new_tokens,
136
- text=text,
137
- top_p=top_p,
138
- repetition_penalty=repetition_penalty,
139
- temperature=temperature,
140
- compile=False,
141
- iterative_prompt=chunk_length > 0,
142
- chunk_length=chunk_length,
143
- max_length=2048,
144
- prompt_tokens=prompt_tokens if enable_reference_audio else None,
145
- prompt_text=reference_text if enable_reference_audio else None,
110
+ self._engine = TTSInferenceEngine(
111
+ self._llama_queue, self._model, precision, enable_compile
146
112
  )
147
113
 
148
- response_queue = queue.Queue()
149
- self._llama_queue.put(
150
- GenerateRequest(
151
- request=request,
152
- response_queue=response_queue,
153
- )
154
- )
155
-
156
- if streaming:
157
- yield wav_chunk_header(), None, None
158
-
159
- segments = []
160
-
161
- while True:
162
- result: WrappedGenerateResponse = response_queue.get() # type: ignore
163
- if result.status == "error":
164
- raise Exception(str(result.response))
165
-
166
- result: GenerateResponse = result.response # type: ignore
167
- if result.action == "next":
168
- break
169
-
170
- with autocast_exclude_mps(
171
- device_type=self._model.device.type, dtype=torch.bfloat16
172
- ):
173
- fake_audios = decode_vq_tokens(
174
- decoder_model=self._model,
175
- codes=result.codes,
176
- )
177
-
178
- fake_audios = fake_audios.float().cpu().numpy()
179
- segments.append(fake_audios)
180
-
181
- if streaming:
182
- yield (fake_audios * 32768).astype(np.int16).tobytes(), None, None
183
-
184
- if len(segments) == 0:
185
- raise Exception("No audio generated, please check the input text.")
186
-
187
- # No matter streaming or not, we need to return the final audio
188
- audio = np.concatenate(segments, axis=0)
189
- yield None, (self._model.spec_transform.sample_rate, audio), None
190
-
191
- if torch.cuda.is_available():
192
- torch.cuda.empty_cache()
193
- gc.collect()
194
-
195
114
  def speech(
196
115
  self,
197
116
  input: str,
@@ -204,29 +123,70 @@ class FishSpeechModel:
204
123
  logger.warning("Fish speech does not support setting voice: %s.", voice)
205
124
  if speed != 1.0:
206
125
  logger.warning("Fish speech does not support setting speed: %s.", speed)
207
- if stream is True:
208
- logger.warning("stream mode is not implemented.")
209
126
  import torchaudio
127
+ from tools.schema import ServeReferenceAudio, ServeTTSRequest
128
+
129
+ prompt_speech = kwargs.get("prompt_speech")
130
+ prompt_text = kwargs.get("prompt_text", kwargs.get("reference_text", ""))
131
+ if prompt_speech is not None:
132
+ r = ServeReferenceAudio(audio=prompt_speech, text=prompt_text)
133
+ references = [r]
134
+ else:
135
+ references = []
210
136
 
211
- result = list(
212
- self._inference(
137
+ assert self._engine is not None
138
+ result = self._engine.inference(
139
+ ServeTTSRequest(
213
140
  text=input,
214
- enable_reference_audio=False,
215
- reference_audio=None,
216
- reference_text=kwargs.get("reference_text", ""),
141
+ references=references,
142
+ reference_id=kwargs.get("reference_id"),
143
+ seed=kwargs.get("seed"),
217
144
  max_new_tokens=kwargs.get("max_new_tokens", 1024),
218
145
  chunk_length=kwargs.get("chunk_length", 200),
219
146
  top_p=kwargs.get("top_p", 0.7),
220
147
  repetition_penalty=kwargs.get("repetition_penalty", 1.2),
221
148
  temperature=kwargs.get("temperature", 0.7),
149
+ streaming=stream,
150
+ format=response_format,
222
151
  )
223
152
  )
224
- sample_rate, audio = result[0][1]
225
- audio = np.array([audio])
226
153
 
227
- # Save the generated audio
228
- with BytesIO() as out:
229
- torchaudio.save(
230
- out, torch.from_numpy(audio), sample_rate, format=response_format
231
- )
232
- return out.getvalue()
154
+ if stream:
155
+
156
+ def _stream_generator():
157
+ with BytesIO() as out:
158
+ writer = torchaudio.io.StreamWriter(out, format=response_format)
159
+ writer.add_audio_stream(
160
+ sample_rate=self._model.spec_transform.sample_rate,
161
+ num_channels=1,
162
+ )
163
+ i = 0
164
+ last_pos = 0
165
+ with writer.open():
166
+ for chunk in result:
167
+ if chunk.code == "final":
168
+ continue
169
+ chunk = chunk.audio[1]
170
+ if chunk is not None:
171
+ chunk = chunk.reshape((chunk.shape[0], 1))
172
+ trans_chunk = torch.from_numpy(chunk)
173
+ writer.write_audio_chunk(i, trans_chunk)
174
+ new_last_pos = out.tell()
175
+ if new_last_pos != last_pos:
176
+ out.seek(last_pos)
177
+ encoded_bytes = out.read()
178
+ yield encoded_bytes
179
+ last_pos = new_last_pos
180
+
181
+ return _stream_generator()
182
+ else:
183
+ result = list(result)
184
+ sample_rate, audio = result[0].audio
185
+ audio = np.array([audio])
186
+
187
+ # Save the generated audio
188
+ with BytesIO() as out:
189
+ torchaudio.save(
190
+ out, torch.from_numpy(audio), sample_rate, format=response_format
191
+ )
192
+ return out.getvalue()