xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (343)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +77 -71
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +79 -19
  6. xinference/core/supervisor.py +172 -10
  7. xinference/core/utils.py +12 -8
  8. xinference/core/worker.py +102 -4
  9. xinference/deploy/cmdline.py +3 -1
  10. xinference/deploy/test/test_cmdline.py +56 -0
  11. xinference/isolation.py +24 -0
  12. xinference/model/audio/core.py +16 -0
  13. xinference/model/audio/cosyvoice.py +39 -6
  14. xinference/model/audio/f5tts.py +200 -0
  15. xinference/model/audio/f5tts_mlx.py +260 -0
  16. xinference/model/audio/fish_speech.py +36 -111
  17. xinference/model/audio/melotts.py +110 -0
  18. xinference/model/audio/model_spec.json +99 -3
  19. xinference/model/audio/model_spec_modelscope.json +27 -0
  20. xinference/model/audio/utils.py +32 -0
  21. xinference/model/audio/whisper.py +35 -10
  22. xinference/model/embedding/core.py +203 -142
  23. xinference/model/embedding/model_spec.json +7 -0
  24. xinference/model/embedding/model_spec_modelscope.json +8 -0
  25. xinference/model/image/core.py +69 -1
  26. xinference/model/image/model_spec.json +145 -4
  27. xinference/model/image/model_spec_modelscope.json +150 -4
  28. xinference/model/image/stable_diffusion/core.py +45 -13
  29. xinference/model/llm/__init__.py +4 -2
  30. xinference/model/llm/llm_family.json +536 -53
  31. xinference/model/llm/llm_family.py +15 -36
  32. xinference/model/llm/llm_family_modelscope.json +454 -20
  33. xinference/model/llm/memory.py +1 -1
  34. xinference/model/llm/mlx/core.py +248 -52
  35. xinference/model/llm/sglang/core.py +1 -0
  36. xinference/model/llm/transformers/chatglm.py +9 -5
  37. xinference/model/llm/transformers/cogagent.py +272 -0
  38. xinference/model/llm/transformers/core.py +2 -0
  39. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  40. xinference/model/llm/transformers/utils.py +16 -8
  41. xinference/model/llm/utils.py +36 -4
  42. xinference/model/llm/vllm/core.py +53 -10
  43. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  44. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  45. xinference/model/llm/vllm/xavier/block.py +111 -0
  46. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  47. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  48. xinference/model/llm/vllm/xavier/collective.py +74 -0
  49. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  50. xinference/model/llm/vllm/xavier/engine.py +247 -0
  51. xinference/model/llm/vllm/xavier/executor.py +134 -0
  52. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  53. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  54. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  55. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  56. xinference/model/video/diffusers.py +14 -0
  57. xinference/model/video/model_spec.json +15 -0
  58. xinference/model/video/model_spec_modelscope.json +16 -0
  59. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  60. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  61. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  62. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  63. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  64. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  65. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  66. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  67. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  68. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  69. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  70. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  71. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  72. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  73. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  74. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  75. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  76. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  77. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  78. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  79. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  80. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  81. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  82. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  83. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  84. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  85. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  86. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  87. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  88. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  89. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  90. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  91. xinference/thirdparty/f5_tts/api.py +166 -0
  92. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  93. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  94. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  95. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  96. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  97. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  98. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  99. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  100. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  101. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  102. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  103. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  104. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  105. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  106. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  107. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  108. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  109. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  110. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  111. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  112. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  114. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  115. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  116. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  117. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  118. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  119. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  120. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  121. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  122. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  123. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  124. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  125. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  126. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  127. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  128. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  129. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  130. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  131. xinference/thirdparty/f5_tts/train/README.md +77 -0
  132. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  133. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  134. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  135. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  136. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  137. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  138. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  139. xinference/thirdparty/f5_tts/train/train.py +75 -0
  140. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  141. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  142. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  143. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  144. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  145. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  146. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  147. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  148. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  149. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  150. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  151. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  152. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  153. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  154. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  155. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  156. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  157. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  158. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  159. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  160. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  161. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  162. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  163. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  164. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  165. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  166. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  167. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  168. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  169. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  170. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  171. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  172. xinference/thirdparty/matcha/utils/utils.py +2 -2
  173. xinference/thirdparty/melo/api.py +135 -0
  174. xinference/thirdparty/melo/app.py +61 -0
  175. xinference/thirdparty/melo/attentions.py +459 -0
  176. xinference/thirdparty/melo/commons.py +160 -0
  177. xinference/thirdparty/melo/configs/config.json +94 -0
  178. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  179. xinference/thirdparty/melo/data_utils.py +413 -0
  180. xinference/thirdparty/melo/download_utils.py +67 -0
  181. xinference/thirdparty/melo/infer.py +25 -0
  182. xinference/thirdparty/melo/init_downloads.py +14 -0
  183. xinference/thirdparty/melo/losses.py +58 -0
  184. xinference/thirdparty/melo/main.py +36 -0
  185. xinference/thirdparty/melo/mel_processing.py +174 -0
  186. xinference/thirdparty/melo/models.py +1030 -0
  187. xinference/thirdparty/melo/modules.py +598 -0
  188. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  189. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  190. xinference/thirdparty/melo/preprocess_text.py +135 -0
  191. xinference/thirdparty/melo/split_utils.py +174 -0
  192. xinference/thirdparty/melo/text/__init__.py +35 -0
  193. xinference/thirdparty/melo/text/chinese.py +199 -0
  194. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  195. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  196. xinference/thirdparty/melo/text/cleaner.py +36 -0
  197. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  198. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  199. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  200. xinference/thirdparty/melo/text/english.py +284 -0
  201. xinference/thirdparty/melo/text/english_bert.py +39 -0
  202. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  203. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  204. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  205. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  206. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  207. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  208. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  209. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  210. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  211. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  212. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  213. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  214. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  215. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  216. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  217. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  218. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  219. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  220. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  221. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  222. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  223. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  224. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  225. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  226. xinference/thirdparty/melo/text/french.py +94 -0
  227. xinference/thirdparty/melo/text/french_bert.py +39 -0
  228. xinference/thirdparty/melo/text/japanese.py +647 -0
  229. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  230. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  231. xinference/thirdparty/melo/text/korean.py +192 -0
  232. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  233. xinference/thirdparty/melo/text/spanish.py +122 -0
  234. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  235. xinference/thirdparty/melo/text/symbols.py +290 -0
  236. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  237. xinference/thirdparty/melo/train.py +635 -0
  238. xinference/thirdparty/melo/train.sh +19 -0
  239. xinference/thirdparty/melo/transforms.py +209 -0
  240. xinference/thirdparty/melo/utils.py +424 -0
  241. xinference/types.py +15 -0
  242. xinference/web/ui/build/asset-manifest.json +6 -6
  243. xinference/web/ui/build/index.html +1 -1
  244. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  245. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  246. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  247. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  248. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  249. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  250. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  251. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  252. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  253. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  254. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  255. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  256. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  257. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  258. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  259. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  260. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  261. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  262. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  263. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  264. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  265. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  266. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  292. xinference/web/ui/node_modules/.package-lock.json +67 -3
  293. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  294. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  295. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  296. xinference/web/ui/node_modules/i18next/package.json +129 -0
  297. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  298. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  299. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  300. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  301. xinference/web/ui/package-lock.json +69 -3
  302. xinference/web/ui/package.json +2 -0
  303. xinference/web/ui/src/locales/en.json +186 -0
  304. xinference/web/ui/src/locales/zh.json +186 -0
  305. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/METADATA +68 -32
  306. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/RECORD +316 -122
  307. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  308. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  309. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  310. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  311. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  312. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  313. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  314. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  315. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  316. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  317. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  318. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  319. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  320. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  321. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  322. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  323. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  324. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  325. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  326. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  327. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  328. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  329. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  330. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  331. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  332. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  333. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  334. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  335. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  336. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  337. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  338. /xinference/thirdparty/{fish_speech/tools → melo/text/fr_phonemizer}/__init__.py +0 -0
  339. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  340. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  341. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/WHEEL +0 -0
  342. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  343. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0

xinference/model/audio/fish_speech.py

@@ -11,10 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import gc
 import logging
 import os.path
-import queue
 import sys
 from io import BytesIO
 from typing import TYPE_CHECKING, Optional
@@ -60,6 +58,7 @@ class FishSpeechModel:
         self._device = device
         self._llama_queue = None
         self._model = None
+        self._engine = None
         self._kwargs = kwargs
 
     @property
@@ -72,6 +71,7 @@ class FishSpeechModel:
             0, os.path.join(os.path.dirname(__file__), "../../thirdparty/fish_speech")
         )
 
+        from tools.inference_engine import TTSInferenceEngine
        from tools.llama.generate import launch_thread_safe_queue
        from tools.vqgan.inference import load_model as load_decoder_model
 
@@ -81,6 +81,11 @@ class FishSpeechModel:
             if not is_device_available(self._device):
                 raise ValueError(f"Device {self._device} is not available!")
 
+        # https://github.com/pytorch/pytorch/issues/129207
+        if self._device == "mps":
+            logger.warning("The Conv1d has bugs on MPS backend, fallback to CPU.")
+            self._device = "cpu"
+
         enable_compile = self._kwargs.get("compile", False)
         precision = self._kwargs.get("precision", torch.bfloat16)
         logger.info("Loading Llama model, compile=%s...", enable_compile)
@@ -102,102 +107,10 @@ class FishSpeechModel:
             device=self._device,
         )
 
-    @torch.inference_mode()
-    def _inference(
-        self,
-        text,
-        enable_reference_audio,
-        reference_audio,
-        reference_text,
-        max_new_tokens,
-        chunk_length,
-        top_p,
-        repetition_penalty,
-        temperature,
-        seed="0",
-        streaming=False,
-    ):
-        from fish_speech.utils import autocast_exclude_mps, set_seed
-        from tools.api import decode_vq_tokens, encode_reference
-        from tools.llama.generate import (
-            GenerateRequest,
-            GenerateResponse,
-            WrappedGenerateResponse,
-        )
-
-        seed = int(seed)
-        if seed != 0:
-            set_seed(seed)
-            logger.warning(f"set seed: {seed}")
-
-        # Parse reference audio aka prompt
-        prompt_tokens = encode_reference(
-            decoder_model=self._model,
-            reference_audio=reference_audio,
-            enable_reference_audio=enable_reference_audio,
-        )
-
-        # LLAMA Inference
-        request = dict(
-            device=self._model.device,
-            max_new_tokens=max_new_tokens,
-            text=text,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            temperature=temperature,
-            compile=self._kwargs.get("compile", False),
-            iterative_prompt=chunk_length > 0,
-            chunk_length=chunk_length,
-            max_length=2048,
-            prompt_tokens=prompt_tokens if enable_reference_audio else None,
-            prompt_text=reference_text if enable_reference_audio else None,
-        )
-
-        response_queue = queue.Queue()
-        self._llama_queue.put(
-            GenerateRequest(
-                request=request,
-                response_queue=response_queue,
-            )
+        self._engine = TTSInferenceEngine(
+            self._llama_queue, self._model, precision, enable_compile
         )
 
-        segments = []
-
-        while True:
-            result: WrappedGenerateResponse = response_queue.get()
-            if result.status == "error":
-                raise result.response
-
-            result: GenerateResponse = result.response
-            if result.action == "next":
-                break
-
-            with autocast_exclude_mps(
-                device_type=self._model.device.type,
-                dtype=self._kwargs.get("precision", torch.bfloat16),
-            ):
-                fake_audios = decode_vq_tokens(
-                    decoder_model=self._model,
-                    codes=result.codes,
-                )
-
-            fake_audios = fake_audios.float().cpu().numpy()
-            segments.append(fake_audios)
-
-            if streaming:
-                yield fake_audios, None, None
-
-        if len(segments) == 0:
-            raise Exception("No audio generated, please check the input text.")
-
-        # No matter streaming or not, we need to return the final audio
-        audio = np.concatenate(segments, axis=0)
-        yield None, (self._model.spec_transform.sample_rate, audio), None
-
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            gc.collect()
-
     def speech(
         self,
         input: str,
@@ -211,21 +124,31 @@ class FishSpeechModel:
         if speed != 1.0:
             logger.warning("Fish speech does not support setting speed: %s.", speed)
         import torchaudio
+        from tools.schema import ServeReferenceAudio, ServeTTSRequest
 
         prompt_speech = kwargs.get("prompt_speech")
-        result = self._inference(
-            text=input,
-            enable_reference_audio=kwargs.get(
-                "enable_reference_audio", prompt_speech is not None
-            ),
-            reference_audio=prompt_speech,
-            reference_text=kwargs.get("reference_text", ""),
-            max_new_tokens=kwargs.get("max_new_tokens", 1024),
-            chunk_length=kwargs.get("chunk_length", 200),
-            top_p=kwargs.get("top_p", 0.7),
-            repetition_penalty=kwargs.get("repetition_penalty", 1.2),
-            temperature=kwargs.get("temperature", 0.7),
-            streaming=stream,
+        prompt_text = kwargs.get("prompt_text", kwargs.get("reference_text", ""))
+        if prompt_speech is not None:
+            r = ServeReferenceAudio(audio=prompt_speech, text=prompt_text)
+            references = [r]
+        else:
+            references = []
+
+        assert self._engine is not None
+        result = self._engine.inference(
+            ServeTTSRequest(
+                text=input,
+                references=references,
+                reference_id=kwargs.get("reference_id"),
+                seed=kwargs.get("seed"),
+                max_new_tokens=kwargs.get("max_new_tokens", 1024),
+                chunk_length=kwargs.get("chunk_length", 200),
+                top_p=kwargs.get("top_p", 0.7),
+                repetition_penalty=kwargs.get("repetition_penalty", 1.2),
+                temperature=kwargs.get("temperature", 0.7),
+                streaming=stream,
+                format=response_format,
+            )
         )
 
         if stream:
@@ -241,7 +164,9 @@ class FishSpeechModel:
                     last_pos = 0
                    with writer.open():
                        for chunk in result:
-                            chunk = chunk[0]
+                            if chunk.code == "final":
+                                continue
+                            chunk = chunk.audio[1]
                            if chunk is not None:
                                chunk = chunk.reshape((chunk.shape[0], 1))
                                trans_chunk = torch.from_numpy(chunk)
@@ -256,7 +181,7 @@ class FishSpeechModel:
             return _stream_generator()
         else:
             result = list(result)
-            sample_rate, audio = result[0][1]
+            sample_rate, audio = result[0].audio
             audio = np.array([audio])
 
             # Save the generated audio
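
The hunks above swap FishSpeech's hand-rolled _inference queue loop for the upstream TTSInferenceEngine, wrapping reference audio in ServeReferenceAudio/ServeTTSRequest. Below is a rough client-side sketch of the reworked path: the endpoint, model name, and file names are placeholders, and the assumption that the REST client forwards the extra keyword arguments (prompt_speech, prompt_text, temperature, ...) read by the rewritten speech() is not part of this diff.

    # Hypothetical usage sketch, not taken from the diff.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    model_uid = client.launch_model(model_name="FishSpeech-1.5", model_type="audio")
    model = client.get_model(model_uid)

    # Optional voice cloning: the rewritten speech() wraps these two kwargs into
    # a ServeReferenceAudio before calling the inference engine.
    with open("reference.wav", "rb") as f:
        prompt_speech = f.read()

    audio_bytes = model.speech(
        "Hello from the rebuilt FishSpeech backend.",
        response_format="wav",
        prompt_speech=prompt_speech,
        prompt_text="Transcript of the reference clip.",
        temperature=0.7,
        top_p=0.7,
    )
    with open("output.wav", "wb") as out:
        out.write(audio_bytes)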

xinference/model/audio/melotts.py (new file)

@@ -0,0 +1,110 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class MeloTTSModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        import os
+        import sys
+
+        import nltk
+
+        # English language requires download averaged_perceptron_tagger_eng
+        nltk.download("averaged_perceptron_tagger_eng")
+
+        # The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
+        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
+
+        from melo.api import TTS
+
+        config_path = os.path.join(self._model_path, "config.json")
+        ckpt_path = os.path.join(self._model_path, "checkpoint.pth")
+        self._model = TTS(
+            language=self._model_spec.language,
+            device=self._device,
+            config_path=config_path,
+            ckpt_path=ckpt_path,
+        )
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import soundfile
+
+        if stream:
+            raise Exception("MeloTTS does not support stream mode.")
+        assert self._model is not None
+        speaker_ids = self._model.hps.data.spk2id
+        if not voice:
+            voice = next(iter(speaker_ids.keys()))
+            logger.info("Auto select speaker: %s", voice)
+        elif voice not in speaker_ids:
+            raise ValueError(
+                f"Invalid voice: {voice}, available speakers: {speaker_ids}"
+            )
+        audio = self._model.tts_to_file(
+            text=input, speaker_id=speaker_ids[voice], speed=speed, **kwargs
+        )
+        # Save the generated audio
+        with BytesIO() as out:
+            with soundfile.SoundFile(
+                out,
+                "w",
+                self._model.hps.data.sampling_rate,
+                1,
+                format=response_format.upper(),
+            ) as f:
+                f.write(audio)
+            return out.getvalue()
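
melotts.py is a brand-new backend. A minimal direct-use sketch follows, assuming the model files have already been downloaded and that the built-in spec registry is exposed as BUILTIN_AUDIO_MODELS; normally Xinference wires the spec and path up itself when the model is launched with model_type="audio".

    # Sketch only: the paths, the BUILTIN_AUDIO_MODELS registry name, and the
    # speaker id "EN-US" are assumptions not shown in this diff.
    from xinference.model.audio import BUILTIN_AUDIO_MODELS
    from xinference.model.audio.melotts import MeloTTSModel

    spec = BUILTIN_AUDIO_MODELS["MeloTTS-English"]
    model = MeloTTSModel(
        model_uid="melotts-demo",
        model_path="/path/to/MeloTTS-English",  # dir with config.json and checkpoint.pth
        model_spec=spec,
    )
    model.load()

    # voice must be a key of hps.data.spk2id; an empty string lets speech() pick
    # the first available speaker, as the code above shows.
    wav_bytes = model.speech("Hello from MeloTTS.", voice="EN-US", response_format="wav")
    with open("melotts.wav", "wb") as f:
        f.write(wav_bytes)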

xinference/model/audio/model_spec.json

@@ -236,11 +236,107 @@
     "multilingual": true
   },
   {
-    "model_name": "FishSpeech-1.4",
+    "model_name": "CosyVoice2-0.5B",
+    "model_family": "CosyVoice",
+    "model_id": "mrfakename/CosyVoice2-0.5B",
+    "model_revision": "5676baabc8a76dc93ef60a88bbd2420deaa2f644",
+    "model_ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "FishSpeech-1.5",
     "model_family": "FishAudio",
-    "model_id": "fishaudio/fish-speech-1.4",
-    "model_revision": "069c573759936b35191d3380deb89183c0656f59",
+    "model_id": "fishaudio/fish-speech-1.5",
+    "model_revision": "268b6ec86243dd683bc78dab7e9a6cedf9191f2a",
+    "model_ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "F5-TTS",
+    "model_family": "F5-TTS",
+    "model_id": "SWivid/F5-TTS",
+    "model_revision": "4dcc16f297f2ff98a17b3726b16f5de5a5e45672",
     "model_ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "F5-TTS-MLX",
+    "model_family": "F5-TTS-MLX",
+    "model_id": "lucasnewman/f5-tts-mlx",
+    "model_revision": "7642bb232e3fcacf92c51c786edebb8624da6b93",
+    "model_ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "MeloTTS-English",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-English",
+    "model_revision": "bb4fb7346d566d277ba8c8c7dbfdf6786139b8ef",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "EN"
+  },
+  {
+    "model_name": "MeloTTS-English-v2",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-English-v2",
+    "model_revision": "a53e3509c4ee4ff16d79272feb2474ff864e18f3",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "EN"
+  },
+  {
+    "model_name": "MeloTTS-English-v3",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-English-v3",
+    "model_revision": "f7c4a35392c0e9be24a755f1edb4c3f63040f759",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "EN"
+  },
+  {
+    "model_name": "MeloTTS-French",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-French",
+    "model_revision": "1e9bf590262392d8bffb679b0a3b0c16b0f9fdaf",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "FR"
+  },
+  {
+    "model_name": "MeloTTS-Japanese",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-Japanese",
+    "model_revision": "367f8795464b531b4e97c1515bddfc1243e60891",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "JP"
+  },
+  {
+    "model_name": "MeloTTS-Spanish",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-Spanish",
+    "model_revision": "dbb5496df39d11a66c1d5f5a9ca357c3c9fb95fb",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "ES"
+  },
+  {
+    "model_name": "MeloTTS-Chinese",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-Chinese",
+    "model_revision": "af5d207a364ea4208c6f589c89f57f88414bdd16",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "ZH"
+  },
+  {
+    "model_name": "MeloTTS-Korean",
+    "model_family": "MeloTTS",
+    "model_id": "myshell-ai/MeloTTS-Korean",
+    "model_revision": "0207e5adfc90129a51b6b03d89be6d84360ed323",
+    "model_ability": "text-to-audio",
+    "multilingual": false,
+    "language": "KR"
   }
 ]

xinference/model/audio/model_spec_modelscope.json

@@ -17,6 +17,15 @@
     "model_ability": "audio-to-text",
     "multilingual": true
   },
+  {
+    "model_name": "Belle-whisper-large-v3-zh",
+    "model_family": "whisper",
+    "model_hub": "modelscope",
+    "model_id": "Xorbits/Belle-whisper-large-v3-zh",
+    "model_revision": "master",
+    "model_ability": "audio-to-text",
+    "multilingual": false
+  },
   {
     "model_name": "SenseVoiceSmall",
     "model_family": "funasr",
@@ -73,5 +82,23 @@
     "model_revision": "master",
     "model_ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice2-0.5B",
+    "model_family": "CosyVoice",
+    "model_hub": "modelscope",
+    "model_id": "iic/CosyVoice2-0.5B",
+    "model_revision": "master",
+    "model_ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "F5-TTS",
+    "model_family": "F5-TTS",
+    "model_hub": "modelscope",
+    "model_id": "SWivid/F5-TTS_Emilia-ZH-EN",
+    "model_revision": "master",
+    "model_ability": "text-to-audio",
+    "multilingual": true
   }
 ]
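
With these entries registered, the new models can be launched by name like any other built-in audio model. A short sketch (the endpoint is a placeholder; Belle-whisper-large-v3-zh only exists in the ModelScope spec, so it assumes a deployment with XINFERENCE_MODEL_SRC=modelscope):

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")

    # Text-to-speech: CosyVoice2-0.5B is registered in both spec files above.
    tts_uid = client.launch_model(model_name="CosyVoice2-0.5B", model_type="audio")

    # Speech-to-text: the new ModelScope-only Belle whisper variant.
    asr_uid = client.launch_model(
        model_name="Belle-whisper-large-v3-zh", model_type="audio"
    )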

xinference/model/audio/utils.py

@@ -11,8 +11,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import io
+
+import numpy as np
+
 from .core import AudioModelFamilyV1
 
 
 def get_model_version(audio_model: AudioModelFamilyV1) -> str:
     return audio_model.model_name
+
+
+def ensure_sample_rate(
+    audio: np.ndarray, old_sample_rate: int, sample_rate: int
+) -> np.ndarray:
+    import soundfile as sf
+    from scipy.signal import resample
+
+    if old_sample_rate != sample_rate:
+        # Calculate the new data length
+        new_length = int(len(audio) * sample_rate / old_sample_rate)
+
+        # Resample the data
+        resampled_data = resample(audio, new_length)
+
+        # Use BytesIO to save the resampled data to memory
+        with io.BytesIO() as buffer:
+            # Write the resampled data to the memory buffer
+            sf.write(buffer, resampled_data, sample_rate, format="WAV")
+
+            # Reset the buffer position to the beginning
+            buffer.seek(0)
+
+            # Read the data from the memory buffer
+            audio, sr = sf.read(buffer, dtype="float32")
+
+    return audio
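
ensure_sample_rate resamples with scipy and round-trips the result through an in-memory WAV so the returned array matches what soundfile would decode at the target rate. A small usage sketch (the input file name is a placeholder):

    import soundfile as sf

    from xinference.model.audio.utils import ensure_sample_rate

    # Load a clip at its native rate, then normalize it to the 16 kHz expected
    # by most ASR front ends; if the rates already match, the array is returned
    # unchanged.
    audio, sr = sf.read("clip_24khz.wav", dtype="float32")
    audio_16k = ensure_sample_rate(audio, old_sample_rate=sr, sample_rate=16000)
    print(len(audio), "->", len(audio_16k), "samples")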

xinference/model/audio/whisper.py

@@ -13,9 +13,12 @@
 # limitations under the License.
 import logging
 import os
+import typing
 from glob import glob
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
+from typing_extensions import TypedDict
+
 from ...device_utils import (
     get_available_device,
     get_device_preferred_dtype,
@@ -28,6 +31,13 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
+class WhisperModelConfig(TypedDict, total=False):
+    chunk_length_s: Optional[float]
+    stride_length_s: Optional[float]
+    return_timestamps: Optional[bool]
+    batch_size: Optional[int]
+
+
 class WhisperModel:
     def __init__(
         self,
@@ -35,6 +45,7 @@ class WhisperModel:
         model_path: str,
         model_spec: "AudioModelFamilyV1",
         device: Optional[str] = None,
+        max_new_tokens: Optional[int] = 128,
         **kwargs,
     ):
         self._model_uid = model_uid
@@ -42,7 +53,21 @@ class WhisperModel:
         self._model_spec = model_spec
         self._device = device
         self._model = None
-        self._kwargs = kwargs
+        self._max_new_tokens = max_new_tokens
+        self._model_config: WhisperModelConfig = self._sanitize_model_config(
+            typing.cast(WhisperModelConfig, kwargs)
+        )
+
+    def _sanitize_model_config(
+        self, model_config: Optional[WhisperModelConfig]
+    ) -> WhisperModelConfig:
+        if model_config is None:
+            model_config = WhisperModelConfig()
+        model_config.setdefault("chunk_length_s", 30)
+        model_config.setdefault("stride_length_s", None)
+        model_config.setdefault("return_timestamps", False)
+        model_config.setdefault("batch_size", 16)
+        return model_config
 
     @property
     def model_ability(self):
@@ -75,10 +100,10 @@ class WhisperModel:
             model=model,
             tokenizer=processor.tokenizer,
             feature_extractor=processor.feature_extractor,
-            max_new_tokens=128,
-            chunk_length_s=30,
-            batch_size=16,
-            return_timestamps=False,
+            chunk_length_s=self._model_config.get("chunk_length_s"),
+            stride_length_s=self._model_config.get("stride_length_s"),
+            return_timestamps=self._model_config.get("return_timestamps"),
+            batch_size=self._model_config.get("batch_size"),
             torch_dtype=torch_dtype,
             device=self._device,
         )
@@ -185,13 +210,13 @@ class WhisperModel:
             logger.warning(
                 "Prompt for whisper transcriptions will be ignored: %s", prompt
             )
+        generate_kwargs = {"max_new_tokens": self._max_new_tokens, "task": "transcribe"}
+        if language is not None:
+            generate_kwargs["language"] = language
+
         return self._call_model(
             audio=audio,
-            generate_kwargs=(
-                {"language": language, "task": "transcribe"}
-                if language is not None
-                else {"task": "transcribe"}
-            ),
+            generate_kwargs=generate_kwargs,
             response_format=response_format,
             temperature=temperature,
             timestamp_granularities=timestamp_granularities,
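
The Whisper changes expose the previously hard-coded pipeline settings (chunk_length_s, stride_length_s, return_timestamps, batch_size) and add a max_new_tokens constructor argument. A hedged sketch of passing them at launch time follows; that extra launch kwargs reach WhisperModel.__init__ matches the usual Xinference pattern but is not shown in this diff, and the endpoint and model name are placeholders.

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="whisper-large-v3",
        model_type="audio",
        return_timestamps=True,  # previously fixed to False
        chunk_length_s=30,
        batch_size=8,
        max_new_tokens=256,      # new constructor argument, default 128
    )
    model = client.get_model(uid)

    with open("speech.wav", "rb") as f:
        result = model.transcriptions(f.read(), language="en")
    print(result)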