xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (373) hide show
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
@@ -8,14 +8,15 @@ import requests
8
8
  from pydub import AudioSegment
9
9
  from pydub.playback import play
10
10
 
11
- from tools.commons import ServeReferenceAudio, ServeTTSRequest
12
11
  from tools.file import audio_to_bytes, read_ref_text
12
+ from tools.schema import ServeReferenceAudio, ServeTTSRequest
13
13
 
14
14
 
15
15
  def parse_args():
16
16
 
17
17
  parser = argparse.ArgumentParser(
18
- description="Send a WAV file and text to a server and receive synthesized audio."
18
+ description="Send a WAV file and text to a server and receive synthesized audio.",
19
+ formatter_class=argparse.RawTextHelpFormatter,
19
20
  )
20
21
 
21
22
  parser.add_argument(
@@ -33,7 +34,7 @@ def parse_args():
33
34
  "-id",
34
35
  type=str,
35
36
  default=None,
36
- help="ID of the reference model o be used for the speech",
37
+ help="ID of the reference model to be used for the speech\n(Local: name of folder containing audios and files)",
37
38
  )
38
39
  parser.add_argument(
39
40
  "--reference_audio",
@@ -41,7 +42,7 @@ def parse_args():
41
42
  type=str,
42
43
  nargs="+",
43
44
  default=None,
44
- help="Path to the WAV file",
45
+ help="Path to the audio file",
45
46
  )
46
47
  parser.add_argument(
47
48
  "--reference_text",
@@ -68,17 +69,21 @@ def parse_args():
68
69
  parser.add_argument(
69
70
  "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
70
71
  )
71
- parser.add_argument("--mp3_bitrate", type=int, default=64)
72
- parser.add_argument("--opus_bitrate", type=int, default=-1000)
73
- parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
72
+ parser.add_argument(
73
+ "--latency",
74
+ type=str,
75
+ default="normal",
76
+ choices=["normal", "balanced"],
77
+ help="Used in api.fish.audio/v1/tts",
78
+ )
74
79
  parser.add_argument(
75
80
  "--max_new_tokens",
76
81
  type=int,
77
82
  default=1024,
78
- help="Maximum new tokens to generate",
83
+ help="Maximum new tokens to generate. \n0 means no limit.",
79
84
  )
80
85
  parser.add_argument(
81
- "--chunk_length", type=int, default=100, help="Chunk length for synthesis"
86
+ "--chunk_length", type=int, default=200, help="Chunk length for synthesis"
82
87
  )
83
88
  parser.add_argument(
84
89
  "--top_p", type=float, default=0.7, help="Top-p sampling for synthesis"
@@ -92,10 +97,7 @@ def parse_args():
92
97
  parser.add_argument(
93
98
  "--temperature", type=float, default=0.7, help="Temperature for sampling"
94
99
  )
95
- parser.add_argument(
96
- "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
97
- )
98
- parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
100
+
99
101
  parser.add_argument(
100
102
  "--streaming", type=bool, default=False, help="Enable streaming response"
101
103
  )
@@ -103,6 +105,20 @@ def parse_args():
103
105
  "--channels", type=int, default=1, help="Number of audio channels"
104
106
  )
105
107
  parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
108
+ parser.add_argument(
109
+ "--use_memory_cache",
110
+ type=str,
111
+ default="off",
112
+ choices=["on", "off"],
113
+ help="Cache encoded references codes in memory.\n",
114
+ )
115
+ parser.add_argument(
116
+ "--seed",
117
+ type=int,
118
+ default=None,
119
+ help="`None` means randomized inference, otherwise deterministic.\n"
120
+ "It can't be used for fixing a timbre.",
121
+ )
106
122
 
107
123
  return parser.parse_args()
108
124
 
@@ -132,22 +148,22 @@ if __name__ == "__main__":
132
148
  data = {
133
149
  "text": args.text,
134
150
  "references": [
135
- ServeReferenceAudio(audio=ref_audio, text=ref_text)
151
+ ServeReferenceAudio(
152
+ audio=ref_audio if ref_audio is not None else b"", text=ref_text
153
+ )
136
154
  for ref_text, ref_audio in zip(ref_texts, byte_audios)
137
155
  ],
138
156
  "reference_id": idstr,
139
157
  "normalize": args.normalize,
140
158
  "format": args.format,
141
- "mp3_bitrate": args.mp3_bitrate,
142
- "opus_bitrate": args.opus_bitrate,
143
159
  "max_new_tokens": args.max_new_tokens,
144
160
  "chunk_length": args.chunk_length,
145
161
  "top_p": args.top_p,
146
162
  "repetition_penalty": args.repetition_penalty,
147
163
  "temperature": args.temperature,
148
- "speaker": args.speaker,
149
- "emotion": args.emotion,
150
164
  "streaming": args.streaming,
165
+ "use_memory_cache": args.use_memory_cache,
166
+ "seed": args.seed,
151
167
  }
152
168
 
153
169
  pydantic_data = ServeTTSRequest(**data)
@@ -0,0 +1,98 @@
1
+ from threading import Lock
2
+
3
+ import pyrootutils
4
+ import uvicorn
5
+ from kui.asgi import FactoryClass, HTTPException, HttpRoute, Kui, OpenAPI, Routes
6
+ from loguru import logger
7
+
8
+ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
9
+
10
+ from tools.server.api_utils import MsgPackRequest, parse_args
11
+ from tools.server.exception_handler import ExceptionHandler
12
+ from tools.server.model_manager import ModelManager
13
+ from tools.server.views import (
14
+ ASRView,
15
+ ChatView,
16
+ HealthView,
17
+ TTSView,
18
+ VQGANDecodeView,
19
+ VQGANEncodeView,
20
+ )
21
+
22
+
23
class API(ExceptionHandler):
    """HTTP API application wrapper for the Fish Speech server.

    Construction parses the command-line arguments, builds the route table
    and the OpenAPI routes, creates the ``Kui`` application, and registers
    ``initialize_app`` as a startup hook.
    """

    def __init__(self):
        self.args = parse_args()

        # Endpoint path -> view class; dict order is the registration order.
        endpoint_views = {
            "/v1/health": HealthView,
            "/v1/vqgan/encode": VQGANEncodeView,
            "/v1/vqgan/decode": VQGANDecodeView,
            "/v1/asr": ASRView,
            "/v1/tts": TTSView,
            "/v1/chat": ChatView,
        }
        self.routes = Routes(
            [HttpRoute(url, handler) for url, handler in endpoint_views.items()]
        )

        openapi_docs = OpenAPI(
            {
                "title": "Fish Speech API",
                "version": "1.5.0",
            },
        )
        self.openapi = openapi_docs.routes

        # Build the application. The first OpenAPI route is skipped
        # (the default route, per the original author's note).
        self.app = app = Kui(
            routes=self.routes + self.openapi[1:],
            exception_handlers={
                HTTPException: self.http_exception_handler,
                Exception: self.other_exception_handler,
            },
            factory_class=FactoryClass(http=MsgPackRequest),
            cors_config={},
        )

        # Shared state that the views read from ``app.state``.
        app.state.lock = Lock()
        app.state.device = self.args.device
        app.state.max_text_length = self.args.max_text_length

        # The model manager is created when the application starts up.
        app.on_startup(self.initialize_app)

    async def initialize_app(self, app: Kui):
        """Startup hook: attach a ``ModelManager`` to ``app.state``."""
        cfg = self.args
        app.state.model_manager = ModelManager(
            mode=cfg.mode,
            device=cfg.device,
            half=cfg.half,
            compile=cfg.compile,
            asr_enabled=cfg.load_asr_model,
            llama_checkpoint_path=cfg.llama_checkpoint_path,
            decoder_checkpoint_path=cfg.decoder_checkpoint_path,
            decoder_config_name=cfg.decoder_config_name,
        )

        logger.info(f"Startup done, listening server at http://{self.args.listen}")
76
+
77
+
78
+ # Each worker process created by Uvicorn has its own memory space,
79
+ # meaning that models and variables are not shared between processes.
80
+ # Therefore, any variables (like `llama_queue` or `decoder_model`)
81
+ # will not be shared across workers.
82
+
83
+ # Multi-threading for deep learning can cause issues, such as inconsistent
84
+ # outputs if multiple threads access the same buffers simultaneously.
85
+ # Instead, it's better to use multiprocessing or independent models per thread.
86
+
87
if __name__ == "__main__":
    # Script entry point: build the API and serve it with Uvicorn.
    api = API()

    # Split on the LAST colon so a host that itself contains colons
    # (e.g. the IPv6 literal "[::1]:8080") still yields (host, port).
    # A value without any colon still raises ValueError, as before.
    host, port = api.args.listen.rsplit(":", 1)
    # Brackets only delimit an IPv6 literal in "host:port" notation.
    # NOTE(review): assumes Uvicorn wants the bare address — confirm.
    host = host.strip("[]")

    uvicorn.run(
        api.app,
        host=host,
        port=int(port),
        workers=api.args.workers,
        log_level="info",
    )
@@ -22,14 +22,14 @@ def check_and_download_files(repo_id, file_list, local_dir):
22
22
 
23
23
 
24
24
  # 1st
25
- repo_id_1 = "fishaudio/fish-speech-1.4"
26
- local_dir_1 = "./checkpoints/fish-speech-1.4"
25
+ repo_id_1 = "fishaudio/fish-speech-1.5"
26
+ local_dir_1 = "./checkpoints/fish-speech-1.5"
27
27
  files_1 = [
28
+ "gitattributes",
28
29
  "model.pth",
29
30
  "README.md",
30
- "special_tokens_map.json",
31
- "tokenizer_config.json",
32
- "tokenizer.json",
31
+ "special_tokens.json",
32
+ "tokenizer.tiktoken",
33
33
  "config.json",
34
34
  "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
35
35
  ]
@@ -0,0 +1,232 @@
1
+ import io
2
+ import re
3
+ import wave
4
+
5
+ import gradio as gr
6
+ import numpy as np
7
+
8
+ from .fish_e2e import FishE2EAgent, FishE2EEventType
9
+ from .schema import ServeMessage, ServeTextPart, ServeVQPart
10
+
11
+
12
def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
    """Return the bytes of a WAV header describing an empty audio stream.

    Args:
        sample_rate: Frame rate written into the header.
        bit_depth: Bits per sample; stored as ``bit_depth // 8`` bytes.
        channels: Number of audio channels.

    Returns:
        The header bytes emitted by the ``wave`` module when no frames
        are written.
    """
    with io.BytesIO() as stream:
        # Closing the writer (on leaving the inner ``with``) is what
        # flushes the header into ``stream``.
        with wave.open(stream, "wb") as writer:
            writer.setnchannels(channels)
            writer.setsampwidth(bit_depth // 8)
            writer.setframerate(sample_rate)
        header = stream.getvalue()
    return header
23
+
24
+
25
class ChatState:
    """Per-session conversation state for the chat demo.

    ``conversation`` holds the message objects in order. The two
    ``added_*`` flags start as ``False``.
    """

    # Preamble of an assistant reply: "Question: <text>\n\nResponse:".
    # The space after the colon is optional so that a reply that is still
    # streaming and currently ends in "Response:" parses instead of failing.
    # NOTE(review): the demo's default system prompt asks for "Answer:"
    # rather than "Response:" — confirm which marker the model emits.
    _QUESTION_PREFIX = re.compile(r"Question: (.*?)\n\nResponse: ?")

    def __init__(self):
        self.conversation = []
        self.added_systext = False
        self.added_sysaudio = False

    def get_history(self):
        """Return the conversation as a list of ``{"role", "content"}`` dicts.

        When an assistant message carries the question preamble and the
        message just before it is a user message, the transcribed question
        is appended to that user message (after a newline) and the preamble
        is removed from the assistant message. ``self.conversation`` itself
        is not modified.
        """
        results = [
            {"role": msg.role, "content": self.repr_message(msg)}
            for msg in self.conversation
        ]

        for i, msg in enumerate(results):
            if msg["role"] != "assistant":
                continue
            match = self._QUESTION_PREFIX.search(msg["content"])
            if match and i > 0 and results[i - 1]["role"] == "user":
                # Move the extracted question onto the preceding user message.
                results[i - 1]["content"] += "\n" + match.group(1)
                # Keep only what follows the preamble. Slicing at the match
                # end cannot raise, unlike indexing the result of a split.
                msg["content"] = msg["content"][match.end():]
        return results

    def repr_message(self, msg: "ServeMessage"):
        """Render a message's parts as one display string.

        Text parts contribute their text. VQ parts contribute an
        ``<audio N.NNs>`` placeholder, where the duration is the length of
        the first code sequence divided by 21 (presumably the codec frame
        rate in Hz — confirm). Parts of any other type are ignored.
        """
        response = ""
        for part in msg.parts:
            if isinstance(part, ServeTextPart):
                response += part.text
            elif isinstance(part, ServeVQPart):
                response += f"<audio {len(part.codes[0]) / 21:.2f}s>"
        return response
55
+
56
+
57
def clear_fn():
    """Return reset values for the "Clear" button.

    The 5-tuple is an empty chat log, a brand-new ``ChatState``, and three
    ``None`` placeholders.
    """
    fresh_state = ChatState()
    return [], fresh_state, None, None, None
59
+
60
+
61
async def process_audio_input(
    sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
):
    """Run one chat turn through the agent and stream UI updates.

    Args:
        sys_audio_input: ``(sample_rate, data)`` tuple of the reference
            clip; any non-tuple value is treated as "no reference clip".
        sys_text_input: System prompt text; added to the conversation only
            once per ``state``.
        audio_input: ``(sample_rate, data)`` tuple of the user's recording,
            or ``None`` when the user typed instead.
        state: Conversation state, mutated in place.
        text_input: Typed user message; when non-empty it is used instead
            of the recording.

    Yields:
        4-tuples ``(history, audio_bytes_or_None, None, None)``.

    Raises:
        gr.Error: If neither audio nor text is given, or if ``audio_input``
            is not a tuple and no text is given.
    """
    if audio_input is None and not text_input:
        raise gr.Error("No input provided")

    agent = FishE2EAgent()  # Create new agent instance for each request

    default_sr = 44100

    # Unpack the user's recording, if any.
    if isinstance(audio_input, tuple):
        sr, audio_data = audio_input
    elif text_input:
        sr, audio_data = default_sr, None
    else:
        raise gr.Error("Invalid audio format")

    if isinstance(sys_audio_input, tuple):
        # NOTE(review): only one sample rate is passed to the agent; when
        # both clips are present the reference clip's rate is used, as in
        # the original code. Confirm how differing rates should be handled.
        sr, sys_audio_data = sys_audio_input
    else:
        # Keep the rate of the user's recording. Resetting it to the
        # default here would mislabel microphone audio at another rate.
        sys_audio_data = None

    def append_to_chat_ctx(
        part: ServeTextPart | ServeVQPart, role: str = "assistant"
    ) -> None:
        # Start a new message when the role changes; otherwise extend the
        # last message with another part.
        if not state.conversation or state.conversation[-1].role != role:
            state.conversation.append(ServeMessage(role=role, parts=[part]))
        else:
            state.conversation[-1].parts.append(part)

    if not state.added_systext and sys_text_input:
        state.added_systext = True
        append_to_chat_ctx(ServeTextPart(text=sys_text_input), role="system")
    if text_input:
        append_to_chat_ctx(ServeTextPart(text=text_input), role="user")
        # Typed text takes precedence over any recording.
        audio_data = None

    async for event in agent.stream(
        sys_audio_data,
        audio_data,
        sr,
        1,
        chat_ctx={
            "messages": state.conversation,
            "added_sysaudio": state.added_sysaudio,
        },
    ):
        if event.type == FishE2EEventType.USER_CODES:
            append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
        elif event.type == FishE2EEventType.SPEECH_SEGMENT:
            append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
            # Each audio chunk is prefixed with its own WAV header.
            yield state.get_history(), wav_chunk_header() + event.frame.data, None, None
        elif event.type == FishE2EEventType.TEXT_SEGMENT:
            append_to_chat_ctx(ServeTextPart(text=event.text))
            yield state.get_history(), None, None, None

    # Final update once the agent stream is exhausted.
    yield state.get_history(), None, None, None
120
+
121
+
122
async def process_text_input(
    sys_audio_input, sys_text_input, state: ChatState, text_input: str
):
    """Handle a typed message by delegating to ``process_audio_input``.

    The recording argument is passed as ``None``; every update produced by
    the delegate is re-yielded unchanged.
    """
    updates = process_audio_input(
        sys_audio_input, sys_text_input, None, state, text_input
    )
    async for ui_update in updates:
        yield ui_update
129
+
130
+
131
def create_demo():
    """Build and return the Gradio ``Blocks`` UI for the Fish Agent demo.

    Layout: a wide left column with the chat log and usage notes, and a
    narrower right column with the reference-voice upload, system prompt,
    microphone input, text input, streamed audio output and buttons.
    The handlers are wired to ``process_audio_input``,
    ``process_text_input`` and ``clear_fn``.
    """
    notes_markdown = (
        "\n"
        "# Fish Agent\n"
        "1. This demo is Fish Audio's self-developed end-to-end language model, Fish Agent version 3B.\n"
        "2. You can find the code and weights in our official repo in [GitHub](https://github.com/fishaudio/fish-speech) and [Hugging Face](https://huggingface.co/fishaudio/fish-agent-v0.1-3b), but the content is released under a CC BY-NC-SA 4.0 licence.\n"
        "3. The demo is an early alpha test version, the inference speed needs to be optimised.\n"
        "# Features\n"
        "1. The model automatically integrates ASR and TTS parts, no need to plug-in other models, i.e., true end-to-end, not three-stage (ASR+LLM+TTS).\n"
        "2. The model can use reference audio to control the speech timbre.\n"
        "3. The model can generate speech with strong emotion.\n"
    )

    with gr.Blocks() as demo:
        state = gr.State(ChatState())

        with gr.Row():
            # Left column (70%) for chatbot and notes
            with gr.Column(scale=7):
                chatbot = gr.Chatbot(
                    [],
                    elem_id="chatbot",
                    bubble_full_width=False,
                    height=600,
                    type="messages",
                )

                gr.Markdown(notes_markdown)

            # Right column (30%) for controls
            with gr.Column(scale=3):
                sys_audio_input = gr.Audio(
                    sources=["upload"],
                    type="numpy",
                    label="Give a timbre for your assistant",
                )
                # NOTE(review): this prompt asks the model to reply with
                # "Answer:", while ChatState.get_history looks for
                # "Response:" — confirm which marker is intended.
                sys_text_input = gr.Textbox(
                    label="What is your assistant's role?",
                    value="You are a voice assistant created by Fish Audio, offering end-to-end voice interaction for a seamless user experience. You are required to first transcribe the user's speech, then answer it in the following format: 'Question: [USER_SPEECH]\n\nAnswer: [YOUR_RESPONSE]\n'. You are required to use the following voice in this conversation.",
                    type="text",
                )
                audio_input = gr.Audio(
                    sources=["microphone"], type="numpy", label="Speak your message"
                )

                text_input = gr.Textbox(label="Or type your message", type="text")

                output_audio = gr.Audio(
                    label="Assistant's Voice",
                    streaming=True,
                    autoplay=True,
                    interactive=False,
                )

                send_button = gr.Button("Send", variant="primary")
                clear_button = gr.Button("Clear")

        # Event handlers
        audio_input.stop_recording(
            process_audio_input,
            inputs=[sys_audio_input, sys_text_input, audio_input, state, text_input],
            outputs=[chatbot, output_audio, audio_input, text_input],
            show_progress=True,
        )

        send_button.click(
            process_text_input,
            inputs=[sys_audio_input, sys_text_input, state, text_input],
            outputs=[chatbot, output_audio, audio_input, text_input],
            show_progress=True,
        )

        text_input.submit(
            process_text_input,
            inputs=[sys_audio_input, sys_text_input, state, text_input],
            outputs=[chatbot, output_audio, audio_input, text_input],
            show_progress=True,
        )

        clear_button.click(
            clear_fn,
            inputs=[],
            outputs=[chatbot, state, audio_input, output_audio, text_input],
        )

    return demo
228
+
229
+
230
# Script entry point: build the demo UI and serve it on 127.0.0.1:7860.
if __name__ == "__main__":
    demo = create_demo()
    # NOTE(review): share=True presumably also requests a public Gradio
    # share link (see the Gradio `launch` docs) — confirm this is intended
    # given the server binds to localhost only.
    demo.launch(server_name="127.0.0.1", server_port=7860, share=True)