xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,298 @@
1
+ import base64
2
+ import ctypes
3
+ import io
4
+ import json
5
+ import os
6
+ import struct
7
+ from dataclasses import dataclass
8
+ from enum import Enum
9
+ from typing import AsyncGenerator, Union
10
+
11
+ import httpx
12
+ import numpy as np
13
+ import ormsgpack
14
+ import soundfile as sf
15
+
16
+ from .schema import (
17
+ ServeChatRequest,
18
+ ServeMessage,
19
+ ServeTextPart,
20
+ ServeVQGANDecodeRequest,
21
+ ServeVQGANEncodeRequest,
22
+ ServeVQPart,
23
+ )
24
+
25
+
26
class CustomAudioFrame:
    """Container for a chunk of interleaved 16-bit PCM audio samples."""

    def __init__(self, data, sample_rate, num_channels, samples_per_channel):
        # Each sample is one int16 per channel; reject buffers too small to
        # hold the declared number of frames.
        required_bytes = (
            num_channels * samples_per_channel * ctypes.sizeof(ctypes.c_int16)
        )
        if len(data) < required_bytes:
            raise ValueError(
                "data length must be >= num_channels * samples_per_channel * sizeof(int16)"
            )

        self._data = bytearray(data)
        self._sample_rate = sample_rate
        self._num_channels = num_channels
        self._samples_per_channel = samples_per_channel

    @property
    def data(self):
        # Expose the raw bytes as an int16 ("h") view without copying.
        return memoryview(self._data).cast("h")

    @property
    def sample_rate(self):
        return self._sample_rate

    @property
    def num_channels(self):
        return self._num_channels

    @property
    def samples_per_channel(self):
        return self._samples_per_channel

    @property
    def duration(self):
        # Seconds of audio represented by this frame.
        return self.samples_per_channel / self.sample_rate

    def __repr__(self):
        fields = (
            f"sample_rate={self.sample_rate}",
            f"num_channels={self.num_channels}",
            f"samples_per_channel={self.samples_per_channel}",
            f"duration={self.duration:.3f}",
        )
        return f"CustomAudioFrame({', '.join(fields)})"
67
+
68
+
69
class FishE2EEventType(Enum):
    """Kinds of events emitted by the Fish end-to-end streaming agent."""

    # A decoded chunk of synthesized audio (carries ``frame``/``vq_codes``).
    SPEECH_SEGMENT = 1
    # A chunk of generated text (carries ``text``).
    TEXT_SEGMENT = 2
    # The model finished producing text.
    END_OF_TEXT = 3
    # The model finished producing audio.
    END_OF_SPEECH = 4
    # A speech-recognition result (carries ``text``).
    ASR_RESULT = 5
    # The VQ codes derived from the user's input audio (carries ``vq_codes``).
    USER_CODES = 6
76
+
77
+
78
@dataclass
class FishE2EEvent:
    """A single event yielded by ``FishE2EAgent.stream``.

    Which optional fields are populated depends on ``type``.
    """

    type: FishE2EEventType
    # NOTE(review): stream() actually assigns a CustomAudioFrame here, not a
    # bare ndarray — consider widening this annotation; confirm with callers.
    frame: np.ndarray | None = None
    text: str | None = None
    vq_codes: list[list[int]] | None = None
84
+
85
+
86
# Module-level shared async HTTP client: no request timeout and an unbounded
# connection pool, intended for long-lived streaming calls to a local server.
client = httpx.AsyncClient(
    timeout=None,
    limits=httpx.Limits(
        max_connections=None,
        max_keepalive_connections=None,
        keepalive_expiry=None,
    ),
)
94
+
95
+
96
class FishE2EAgent:
    """End-to-end speech agent backed by a local fish-speech HTTP server.

    Encodes audio to VQ tokens via the server's VQGAN endpoint, streams a
    chat completion from the LLM endpoint, and decodes returned VQ tokens
    back to audio, yielding ``FishE2EEvent`` objects as results arrive.
    """

    def __init__(self):
        # Endpoints of the local inference server; per-instance client with
        # no timeout, since streaming responses may be long-lived.
        self.llm_url = "http://localhost:8080/v1/chat"
        self.vqgan_url = "http://localhost:8080"
        self.client = httpx.AsyncClient(timeout=None)

    async def get_codes(self, audio_data, sample_rate):
        """Encode raw audio into VQ token codes via the VQGAN HTTP endpoint.

        ``audio_data`` is written to an in-memory WAV and sent as msgpack;
        returns the first entry of the server's ``tokens`` list.
        """
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, audio_data, sample_rate, format="WAV")
        audio_buffer.seek(0)
        # Step 1: Encode audio using VQGAN
        encode_request = ServeVQGANEncodeRequest(audios=[audio_buffer.read()])
        encode_request_bytes = ormsgpack.packb(
            encode_request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC
        )
        encode_response = await self.client.post(
            f"{self.vqgan_url}/v1/vqgan/encode",
            data=encode_request_bytes,
            headers={"Content-Type": "application/msgpack"},
        )
        encode_response_data = ormsgpack.unpackb(encode_response.content)
        codes = encode_response_data["tokens"][0]
        return codes

    # NOTE(review): the return annotation says AsyncGenerator[bytes, None],
    # but every ``yield`` below produces a FishE2EEvent — the annotation
    # should likely be AsyncGenerator[FishE2EEvent, None].
    async def stream(
        self,
        system_audio_data: np.ndarray | None,
        user_audio_data: np.ndarray | None,
        sample_rate: int,
        num_channels: int,
        chat_ctx: dict | None = None,
    ) -> AsyncGenerator[bytes, None]:
        """Run one end-to-end chat turn, yielding FishE2EEvent objects.

        Optionally encodes system/user audio to VQ codes, builds (or extends)
        the chat context, streams the LLM response, and decodes VQ segments
        to audio on the fly. When ``chat_ctx`` is supplied it must contain
        the keys ``"messages"`` and ``"added_sysaudio"``.
        """
        if system_audio_data is not None:
            sys_codes = await self.get_codes(system_audio_data, sample_rate)
        else:
            sys_codes = None
        if user_audio_data is not None:
            user_codes = await self.get_codes(user_audio_data, sample_rate)
        # Step 2: Prepare LLM request
        if chat_ctx is None:
            sys_parts = [
                ServeTextPart(
                    text='您是由 Fish Audio 设计的语音助手,提供端到端的语音交互,实现无缝用户体验。首先转录用户的语音,然后使用以下格式回答:"Question: [用户语音]\n\nAnswer: [你的回答]\n"。'
                ),
            ]
            if system_audio_data is not None:
                sys_parts.append(ServeVQPart(codes=sys_codes))
            chat_ctx = {
                "messages": [
                    ServeMessage(
                        role="system",
                        parts=sys_parts,
                    ),
                ],
            }
        else:
            # Attach the system audio codes once to the existing context.
            if chat_ctx["added_sysaudio"] is False and sys_codes:
                chat_ctx["added_sysaudio"] = True
                chat_ctx["messages"][0].parts.append(ServeVQPart(codes=sys_codes))

        prev_messages = chat_ctx["messages"].copy()
        if user_audio_data is not None:
            # Surface the user's VQ codes to the caller before streaming.
            yield FishE2EEvent(
                type=FishE2EEventType.USER_CODES,
                vq_codes=user_codes,
            )
        else:
            user_codes = None

        request = ServeChatRequest(
            messages=prev_messages
            + (
                [
                    ServeMessage(
                        role="user",
                        parts=[ServeVQPart(codes=user_codes)],
                    )
                ]
                if user_codes
                else []
            ),
            streaming=True,
            num_samples=1,
        )

        # Step 3: Stream LLM response and decode audio
        buffer = b""
        vq_codes = []
        current_vq = False

        async def decode_send():
            # Flush accumulated VQ codes: decode them to audio and yield one
            # SPEECH_SEGMENT event, then reset the accumulation state.
            nonlocal current_vq
            nonlocal vq_codes

            data = np.concatenate(vq_codes, axis=1).tolist()
            # Decode VQ codes to audio
            decode_request = ServeVQGANDecodeRequest(tokens=[data])
            decode_response = await self.client.post(
                f"{self.vqgan_url}/v1/vqgan/decode",
                data=ormsgpack.packb(
                    decode_request,
                    option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
                ),
                headers={"Content-Type": "application/msgpack"},
            )
            decode_data = ormsgpack.unpackb(decode_response.content)

            # Convert float16 audio data to int16
            audio_data = np.frombuffer(decode_data["audios"][0], dtype=np.float16)
            audio_data = (audio_data * 32768).astype(np.int16).tobytes()

            # NOTE(review): sample rate is hard-coded to 44100 here —
            # confirm it matches the decoder's actual output rate.
            audio_frame = CustomAudioFrame(
                data=audio_data,
                samples_per_channel=len(audio_data) // 2,
                sample_rate=44100,
                num_channels=1,
            )
            yield FishE2EEvent(
                type=FishE2EEventType.SPEECH_SEGMENT,
                frame=audio_frame,
                vq_codes=data,
            )

            current_vq = False
            vq_codes = []

        async with self.client.stream(
            "POST",
            self.llm_url,
            data=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
            headers={"Content-Type": "application/msgpack"},
        ) as response:

            async for chunk in response.aiter_bytes():
                buffer += chunk

                # The stream is length-prefixed msgpack: 4-byte length
                # followed by the payload.
                # NOTE(review): "I" uses native byte order/alignment — the
                # server presumably emits the same; confirm, or use "<I".
                while len(buffer) >= 4:
                    read_length = struct.unpack("I", buffer[:4])[0]
                    if len(buffer) < 4 + read_length:
                        break

                    body = buffer[4 : 4 + read_length]
                    buffer = buffer[4 + read_length :]
                    data = ormsgpack.unpackb(body)

                    if data["delta"] and data["delta"]["part"]:
                        # A text part after VQ parts closes the current
                        # speech segment: decode and emit it first.
                        if current_vq and data["delta"]["part"]["type"] == "text":
                            async for event in decode_send():
                                yield event
                        if data["delta"]["part"]["type"] == "text":
                            yield FishE2EEvent(
                                type=FishE2EEventType.TEXT_SEGMENT,
                                text=data["delta"]["part"]["text"],
                            )
                        elif data["delta"]["part"]["type"] == "vq":
                            vq_codes.append(np.array(data["delta"]["part"]["codes"]))
                            current_vq = True

        # Flush any trailing VQ codes left when the stream ended.
        if current_vq and vq_codes:
            async for event in decode_send():
                yield event

        yield FishE2EEvent(type=FishE2EEventType.END_OF_TEXT)
        yield FishE2EEvent(type=FishE2EEventType.END_OF_SPEECH)
261
+
262
+
263
# Example usage:
async def main():
    """Example driver: stream a local audio file through FishE2EAgent.

    Feeds ``uz_story_en.m4a`` as the user's audio (no system audio), writes
    decoded speech segments to ``audio_segment.wav`` and prints text events.
    """
    import torchaudio

    agent = FishE2EAgent()

    # Load the user audio; torchaudio returns a (channels, samples) float
    # tensor plus the sample rate. (The previous version also read the raw
    # file bytes into ``audio_data`` first, which was immediately
    # overwritten — that dead read is removed.)
    audio_data, sample_rate = torchaudio.load("uz_story_en.m4a")
    # Convert the float waveform to 16-bit PCM as expected by the agent.
    audio_data = (audio_data.numpy() * 32768).astype(np.int16)

    # Bug fix: stream() requires (system_audio_data, user_audio_data,
    # sample_rate, num_channels); the old call passed only three positional
    # arguments and raised TypeError. Pass None for the system audio.
    stream = agent.stream(None, audio_data, sample_rate, 1)
    if os.path.exists("audio_segment.wav"):
        os.remove("audio_segment.wav")

    async for event in stream:
        if event.type == FishE2EEventType.SPEECH_SEGMENT:
            # Handle speech segment (e.g., play audio or save to file).
            # Note: this appends raw PCM frames, not a valid WAV container.
            with open("audio_segment.wav", "ab+") as f:
                f.write(event.frame.data)
        elif event.type == FishE2EEventType.ASR_RESULT:
            print(event.text, flush=True)
        elif event.type == FishE2EEventType.TEXT_SEGMENT:
            print(event.text, flush=True, end="")
        elif event.type == FishE2EEventType.END_OF_TEXT:
            print("\nEnd of text reached.")
        elif event.type == FishE2EEventType.END_OF_SPEECH:
            print("End of speech reached.")
293
+
294
+
295
+ if __name__ == "__main__":
296
+ import asyncio
297
+
298
+ asyncio.run(main())
@@ -0,0 +1,192 @@
1
+ import gc
2
+ import queue
3
+ from typing import Generator
4
+
5
+ import numpy as np
6
+ import torch
7
+ from loguru import logger
8
+
9
+ from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
10
+ from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
11
+ from fish_speech.utils import autocast_exclude_mps, set_seed
12
+ from tools.inference_engine.reference_loader import ReferenceLoader
13
+ from tools.inference_engine.utils import InferenceResult, wav_chunk_header
14
+ from tools.inference_engine.vq_manager import VQManager
15
+ from tools.llama.generate import (
16
+ GenerateRequest,
17
+ GenerateResponse,
18
+ WrappedGenerateResponse,
19
+ )
20
+ from tools.schema import ServeTTSRequest
21
+
22
+
23
class TTSInferenceEngine(ReferenceLoader, VQManager):
    """Text-to-speech inference engine.

    Combines a LLAMA-based semantic-token generator (reached through
    ``llama_queue``) with a VQGAN decoder that turns the generated VQ tokens
    into audio. Reference loading/caching comes from ``ReferenceLoader``.
    """

    def __init__(
        self,
        llama_queue: queue.Queue,
        decoder_model: FireflyArchitecture,
        precision: torch.dtype,
        compile: bool,
    ) -> None:
        """Store the handles used during inference.

        Args:
            llama_queue: Work queue consumed by the LLAMA generation worker.
            decoder_model: VQGAN decoder used to synthesize audio.
            precision: dtype used for autocast while decoding.
            compile: Whether the LLAMA worker should use a compiled model.
        """
        super().__init__()

        self.llama_queue = llama_queue
        self.decoder_model = decoder_model
        self.precision = precision
        self.compile = compile

    @torch.inference_mode()
    def inference(self, req: ServeTTSRequest) -> Generator[InferenceResult, None, None]:
        """
        Main inference function:
        - Loads the reference audio and text.
        - Calls the LLAMA model for inference.
        - Decodes the VQ tokens to audio.

        Yields ``InferenceResult`` objects: ``segment`` results while
        streaming, then either one ``final`` result or an ``error`` result.
        """

        ref_id: str | None = req.reference_id
        prompt_tokens, prompt_texts = [], []
        # Load the reference audio and text based on id or hash
        if ref_id is not None:
            prompt_tokens, prompt_texts = self.load_by_id(ref_id, req.use_memory_cache)

        elif req.references:
            prompt_tokens, prompt_texts = self.load_by_hash(
                req.references, req.use_memory_cache
            )

        # Set the random seed if provided
        if req.seed is not None:
            set_seed(req.seed)
            logger.warning(f"set seed: {req.seed}")

        # Get the symbolic tokens from the LLAMA model
        response_queue = self.send_Llama_request(req, prompt_tokens, prompt_texts)

        # Get the sample rate from the decoder model
        sample_rate = self.decoder_model.spec_transform.sample_rate

        # If streaming, send the header
        # if req.streaming:
        #     yield InferenceResult(
        #         code="header",
        #         audio=(sample_rate, wav_chunk_header(sample_rate=sample_rate)),
        #         error=None,
        #     )

        segments = []

        while True:
            # Get the response from the LLAMA model
            wrapped_result: WrappedGenerateResponse = response_queue.get()
            if wrapped_result.status == "error":
                yield InferenceResult(
                    code="error",
                    audio=None,
                    error=(
                        wrapped_result.response
                        if isinstance(wrapped_result.response, Exception)
                        else Exception("Unknown error")
                    ),
                )
                break

            # Check the response type
            if not isinstance(wrapped_result.response, GenerateResponse):
                # Bug fix: the message was missing its f-prefix, so the
                # placeholder text was emitted literally instead of the
                # actual type name.
                raise TypeError(
                    f"Expected GenerateResponse, got {type(wrapped_result.response).__name__}"
                )

            result: GenerateResponse = wrapped_result.response
            if result.action != "next":
                segment = self.get_audio_segment(result)

                if req.streaming:  # Used only by the API server
                    yield InferenceResult(
                        code="segment",
                        audio=(sample_rate, segment),
                        error=None,
                    )
                segments.append(segment)
            else:
                # "next" signals the end of generation.
                break

        # Clean up the memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()

        # Edge case: no audio generated (note: also reached after an error
        # result above, producing a second error event — callers should stop
        # on the first "error").
        if len(segments) == 0:
            yield InferenceResult(
                code="error",
                audio=None,
                error=RuntimeError("No audio generated, please check the input text."),
            )
        else:
            # Streaming or not, return the final audio
            audio = np.concatenate(segments, axis=0)
            yield InferenceResult(
                code="final",
                audio=(sample_rate, audio),
                error=None,
            )

        return None

    def send_Llama_request(
        self, req: ServeTTSRequest, prompt_tokens: list, prompt_texts: list
    ) -> queue.Queue:
        """
        Send a request to the LLAMA model to generate the symbolic tokens.

        Returns the per-request response queue the worker will fill with
        ``WrappedGenerateResponse`` items.
        """

        # Prepare the request
        request = dict(
            device=self.decoder_model.device,
            max_new_tokens=req.max_new_tokens,
            text=(
                req.text
                if not req.normalize
                else ChnNormedText(raw_text=req.text).normalize()
            ),
            top_p=req.top_p,
            repetition_penalty=req.repetition_penalty,
            temperature=req.temperature,
            compile=self.compile,
            iterative_prompt=req.chunk_length > 0,
            chunk_length=req.chunk_length,
            max_length=4096,
            prompt_tokens=prompt_tokens,
            prompt_text=prompt_texts,
        )

        # Create a queue to get the response
        response_queue = queue.Queue()

        # Send the request to the LLAMA model
        self.llama_queue.put(
            GenerateRequest(
                request=request,
                response_queue=response_queue,
            )
        )

        return response_queue

    def get_audio_segment(self, result: GenerateResponse) -> np.ndarray:
        """
        Decode the VQ tokens to audio.

        Returns the decoded waveform as a float numpy array on the CPU.
        """

        # Don't use autocast on MPS devices
        with autocast_exclude_mps(
            device_type=self.decoder_model.device.type, dtype=self.precision
        ):
            # Decode the symbolic tokens to audio
            segment = self.decode_vq_tokens(codes=result.codes)

        # Convert the audio to numpy
        return segment.float().cpu().numpy()
@@ -0,0 +1,125 @@
1
+ import io
2
+ from hashlib import sha256
3
+ from pathlib import Path
4
+ from typing import Callable, Literal, Tuple
5
+
6
+ import torch
7
+ import torchaudio
8
+ from loguru import logger
9
+
10
+ from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
11
+ from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text
12
+ from tools.schema import ServeReferenceAudio
13
+
14
+
15
class ReferenceLoader:

    def __init__(self) -> None:
        """
        Component of the TTSInferenceEngine class.
        Loads and manages the cache for the reference audio and text.
        """
        # In-memory caches: reference-id -> (tokens, texts) and
        # audio-sha256 -> (tokens, texts).
        self.ref_by_id: dict = {}
        self.ref_by_hash: dict = {}

        # Make Pylance happy (attribute/method not defined...)
        self.decoder_model: FireflyArchitecture
        self.encode_reference: Callable

        # Define the torchaudio backend
        backends = torchaudio.list_audio_backends()
        if "ffmpeg" in backends:
            self.backend = "ffmpeg"
        else:
            self.backend = "soundfile"

    def load_by_id(
        self,
        id: str,
        use_cache: Literal["on", "off"],
    ) -> Tuple:
        """Load reference prompt tokens/texts from ``references/<id>``.

        Each reference audio file is paired with a ``.lab`` transcript of
        the same stem. Returns ``(prompt_tokens, prompt_texts)``.
        """
        # Load the references audio and text by id
        ref_folder = Path("references") / id
        ref_folder.mkdir(parents=True, exist_ok=True)
        ref_audios = list_files(
            ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False
        )

        if use_cache == "off" or id not in self.ref_by_id:
            # If the references are not already loaded, encode them
            # NOTE(review): even with use_cache == "off" the fresh result is
            # written into the cache below — confirm that is intended.
            prompt_tokens = [
                self.encode_reference(
                    # decoder_model=self.decoder_model,
                    reference_audio=audio_to_bytes(str(ref_audio)),
                    enable_reference_audio=True,
                )
                for ref_audio in ref_audios
            ]
            prompt_texts = [
                read_ref_text(str(ref_audio.with_suffix(".lab")))
                for ref_audio in ref_audios
            ]
            self.ref_by_id[id] = (prompt_tokens, prompt_texts)

        else:
            # Reuse already encoded references
            logger.info("Use same references")
            prompt_tokens, prompt_texts = self.ref_by_id[id]

        return prompt_tokens, prompt_texts

    def load_by_hash(
        self,
        references: list[ServeReferenceAudio],
        use_cache: Literal["on", "off"],
    ) -> Tuple:
        """Load reference prompt tokens/texts keyed by audio sha256.

        Returns ``(prompt_tokens, prompt_texts)``.
        """
        # Load the references audio and text by hash
        audio_hashes = [sha256(ref.audio).hexdigest() for ref in references]

        cache_used = False
        prompt_tokens, prompt_texts = [], []
        # NOTE(review): this cache looks buggy when cached and uncached
        # references are mixed — a miss stores the still-accumulating whole
        # lists under the single hash, and a hit replaces the accumulated
        # lists with the cached tuple, dropping earlier entries. Confirm
        # whether per-reference (token, text) entries were intended.
        for i, ref in enumerate(references):
            if use_cache == "off" or audio_hashes[i] not in self.ref_by_hash:
                # If the references are not already loaded, encode them
                prompt_tokens.append(
                    self.encode_reference(
                        reference_audio=ref.audio,
                        enable_reference_audio=True,
                    )
                )
                prompt_texts.append(ref.text)
                self.ref_by_hash[audio_hashes[i]] = (prompt_tokens, prompt_texts)

            else:
                # Reuse already encoded references
                prompt_tokens, prompt_texts = self.ref_by_hash[audio_hashes[i]]
                cache_used = True

        if cache_used:
            logger.info("Use same references")

        return prompt_tokens, prompt_texts

    def load_audio(self, reference_audio, sr):
        """
        Load the audio data from a file or bytes.

        Returns a mono waveform resampled to ``sr`` as a numpy array.
        """
        # Heuristic: values longer than a path can be, or that don't exist
        # on disk, are treated as raw audio bytes rather than a file path.
        if len(reference_audio) > 255 or not Path(reference_audio).exists():
            audio_data = reference_audio
            reference_audio = io.BytesIO(audio_data)

        waveform, original_sr = torchaudio.load(reference_audio, backend=self.backend)

        # Downmix multi-channel audio to mono.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to the requested rate if needed.
        if original_sr != sr:
            resampler = torchaudio.transforms.Resample(
                orig_freq=original_sr, new_freq=sr
            )
            waveform = resampler(waveform)

        audio = waveform.squeeze().numpy()
        return audio
@@ -0,0 +1,39 @@
1
+ import io
2
+ import wave
3
+ from dataclasses import dataclass
4
+ from typing import Literal, Optional, Tuple
5
+
6
+ import numpy as np
7
+
8
+ from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
9
+
10
+
11
@dataclass
class InferenceResult:
    """One unit of output from the TTS inference generator."""

    # "header": WAV header bytes (streaming); "segment": intermediate audio
    # chunk (streaming); "error": failure; "final": the complete audio.
    code: Literal["header", "segment", "error", "final"]
    # (sample_rate, data) pair; None for "error" results.
    audio: Optional[Tuple[int, np.ndarray | bytes]]
    # Populated only when ``code == "error"``.
    error: Optional[Exception]
16
+
17
+
18
def normalize_text(user_input: str, use_normalization: bool) -> str:
    """Normalize user input text if needed."""
    if not use_normalization:
        return user_input
    # Apply Chinese text normalization (numbers, dates, etc.).
    return ChnNormedText(raw_text=user_input).normalize()
24
+
25
+
26
def wav_chunk_header(
    sample_rate: int = 44100, bit_depth: int = 16, channels: int = 1
) -> bytes:
    """Return a standalone RIFF/WAVE header with zero data frames.

    Useful as the first chunk of a streamed WAV response; PCM frames can be
    appended after it.
    """
    buffer = io.BytesIO()

    writer = wave.open(buffer, "wb")
    try:
        writer.setnchannels(channels)
        writer.setsampwidth(bit_depth // 8)
        writer.setframerate(sample_rate)
    finally:
        # Closing flushes the header bytes into the buffer.
        writer.close()

    wav_header_bytes = buffer.getvalue()
    buffer.close()

    return wav_header_bytes