xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of xinference has been flagged as potentially problematic.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/model/audio/melotts.py (new file)
@@ -0,0 +1,110 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import logging
+ from io import BytesIO
+ from typing import TYPE_CHECKING, Optional
+
+ from ...device_utils import get_available_device, is_device_available
+
+ if TYPE_CHECKING:
+     from .core import AudioModelFamilyV1
+
+ logger = logging.getLogger(__name__)
+
+
+ class MeloTTSModel:
+     def __init__(
+         self,
+         model_uid: str,
+         model_path: str,
+         model_spec: "AudioModelFamilyV1",
+         device: Optional[str] = None,
+         **kwargs,
+     ):
+         self._model_uid = model_uid
+         self._model_path = model_path
+         self._model_spec = model_spec
+         self._device = device
+         self._model = None
+         self._kwargs = kwargs
+
+     @property
+     def model_ability(self):
+         return self._model_spec.model_ability
+
+     def load(self):
+         if self._device is None:
+             self._device = get_available_device()
+         else:
+             if not is_device_available(self._device):
+                 raise ValueError(f"Device {self._device} is not available!")
+
+         import os
+         import sys
+
+         import nltk
+
+         # English language requires download averaged_perceptron_tagger_eng
+         nltk.download("averaged_perceptron_tagger_eng")
+
+         # The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
+         sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
+
+         from melo.api import TTS
+
+         config_path = os.path.join(self._model_path, "config.json")
+         ckpt_path = os.path.join(self._model_path, "checkpoint.pth")
+         self._model = TTS(
+             language=self._model_spec.language,
+             device=self._device,
+             config_path=config_path,
+             ckpt_path=ckpt_path,
+         )
+
+     def speech(
+         self,
+         input: str,
+         voice: str,
+         response_format: str = "mp3",
+         speed: float = 1.0,
+         stream: bool = False,
+         **kwargs,
+     ):
+         import soundfile
+
+         if stream:
+             raise Exception("MeloTTS does not support stream mode.")
+         assert self._model is not None
+         speaker_ids = self._model.hps.data.spk2id
+         if not voice:
+             voice = next(iter(speaker_ids.keys()))
+             logger.info("Auto select speaker: %s", voice)
+         elif voice not in speaker_ids:
+             raise ValueError(
+                 f"Invalid voice: {voice}, available speakers: {speaker_ids}"
+             )
+         audio = self._model.tts_to_file(
+             text=input, speaker_id=speaker_ids[voice], speed=speed, **kwargs
+         )
+         # Save the generated audio
+         with BytesIO() as out:
+             with soundfile.SoundFile(
+                 out,
+                 "w",
+                 self._model.hps.data.sampling_rate,
+                 1,
+                 format=response_format.upper(),
+             ) as f:
+                 f.write(audio)
+             return out.getvalue()
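
The new MeloTTSModel above plugs into xinference's audio model machinery. As a rough illustration only (not taken from this diff), the sketch below shows how such a text-to-speech model would typically be exercised through the xinference client; the endpoint URL, model name, and output file are assumptions, and the speech() call simply mirrors the MeloTTSModel.speech signature above.

# Hypothetical usage sketch for the new MeloTTS support; endpoint and model name are assumptions.
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(model_name="MeloTTS-English", model_type="audio")
model = client.get_model(model_uid)

# An empty voice lets MeloTTSModel.speech() auto-select the first available speaker.
audio_bytes = model.speech("Hello from MeloTTS", voice="", response_format="wav")
with open("hello.wav", "wb") as f:
    f.write(audio_bytes)
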
xinference/model/audio/model_spec.json
@@ -103,6 +103,86 @@
      "model_ability": "audio-to-text",
      "multilingual": false
    },
+   {
+     "model_name": "whisper-tiny-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-tiny",
+     "model_ability": "audio-to-text",
+     "multilingual": true,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-tiny.en-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-tiny.en-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": false,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-base-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-base-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": true,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-base.en-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-base.en-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": false,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-small-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-small-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": true,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-small.en-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-small.en-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": false,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-medium-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-medium-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": true,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-medium.en-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-medium.en-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": false,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-large-v3-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-large-v3-mlx",
+     "model_ability": "audio-to-text",
+     "multilingual": true,
+     "engine": "mlx"
+   },
+   {
+     "model_name": "whisper-large-v3-turbo-mlx",
+     "model_family": "whisper",
+     "model_id": "mlx-community/whisper-large-v3-turbo",
+     "model_ability": "audio-to-text",
+     "multilingual": true,
+     "engine": "mlx"
+   },
    {
      "model_name": "SenseVoiceSmall",
      "model_family": "funasr",
@@ -156,11 +236,107 @@
      "multilingual": true
    },
    {
-     "model_name": "FishSpeech-1.4",
+     "model_name": "CosyVoice2-0.5B",
+     "model_family": "CosyVoice",
+     "model_id": "mrfakename/CosyVoice2-0.5B",
+     "model_revision": "5676baabc8a76dc93ef60a88bbd2420deaa2f644",
+     "model_ability": "text-to-audio",
+     "multilingual": true
+   },
+   {
+     "model_name": "FishSpeech-1.5",
      "model_family": "FishAudio",
-     "model_id": "fishaudio/fish-speech-1.4",
-     "model_revision": "3c49651b8e583b6b13f55e375432e0d57e1aa84d",
+     "model_id": "fishaudio/fish-speech-1.5",
+     "model_revision": "268b6ec86243dd683bc78dab7e9a6cedf9191f2a",
      "model_ability": "text-to-audio",
      "multilingual": true
+   },
+   {
+     "model_name": "F5-TTS",
+     "model_family": "F5-TTS",
+     "model_id": "SWivid/F5-TTS",
+     "model_revision": "4dcc16f297f2ff98a17b3726b16f5de5a5e45672",
+     "model_ability": "text-to-audio",
+     "multilingual": true
+   },
+   {
+     "model_name": "F5-TTS-MLX",
+     "model_family": "F5-TTS-MLX",
+     "model_id": "lucasnewman/f5-tts-mlx",
+     "model_revision": "7642bb232e3fcacf92c51c786edebb8624da6b93",
+     "model_ability": "text-to-audio",
+     "multilingual": true
+   },
+   {
+     "model_name": "MeloTTS-English",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-English",
+     "model_revision": "bb4fb7346d566d277ba8c8c7dbfdf6786139b8ef",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "EN"
+   },
+   {
+     "model_name": "MeloTTS-English-v2",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-English-v2",
+     "model_revision": "a53e3509c4ee4ff16d79272feb2474ff864e18f3",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "EN"
+   },
+   {
+     "model_name": "MeloTTS-English-v3",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-English-v3",
+     "model_revision": "f7c4a35392c0e9be24a755f1edb4c3f63040f759",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "EN"
+   },
+   {
+     "model_name": "MeloTTS-French",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-French",
+     "model_revision": "1e9bf590262392d8bffb679b0a3b0c16b0f9fdaf",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "FR"
+   },
+   {
+     "model_name": "MeloTTS-Japanese",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-Japanese",
+     "model_revision": "367f8795464b531b4e97c1515bddfc1243e60891",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "JP"
+   },
+   {
+     "model_name": "MeloTTS-Spanish",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-Spanish",
+     "model_revision": "dbb5496df39d11a66c1d5f5a9ca357c3c9fb95fb",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "ES"
+   },
+   {
+     "model_name": "MeloTTS-Chinese",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-Chinese",
+     "model_revision": "af5d207a364ea4208c6f589c89f57f88414bdd16",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "ZH"
+   },
+   {
+     "model_name": "MeloTTS-Korean",
+     "model_family": "MeloTTS",
+     "model_id": "myshell-ai/MeloTTS-Korean",
+     "model_revision": "0207e5adfc90129a51b6b03d89be6d84360ed323",
+     "model_ability": "text-to-audio",
+     "multilingual": false,
+     "language": "KR"
    }
  ]
xinference/model/audio/model_spec_modelscope.json
@@ -17,6 +17,15 @@
      "model_ability": "audio-to-text",
      "multilingual": true
    },
+   {
+     "model_name": "Belle-whisper-large-v3-zh",
+     "model_family": "whisper",
+     "model_hub": "modelscope",
+     "model_id": "Xorbits/Belle-whisper-large-v3-zh",
+     "model_revision": "master",
+     "model_ability": "audio-to-text",
+     "multilingual": false
+   },
    {
      "model_name": "SenseVoiceSmall",
      "model_family": "funasr",
@@ -73,5 +82,23 @@
      "model_revision": "master",
      "model_ability": "text-to-audio",
      "multilingual": true
+   },
+   {
+     "model_name": "CosyVoice2-0.5B",
+     "model_family": "CosyVoice",
+     "model_hub": "modelscope",
+     "model_id": "iic/CosyVoice2-0.5B",
+     "model_revision": "master",
+     "model_ability": "text-to-audio",
+     "multilingual": true
+   },
+   {
+     "model_name": "F5-TTS",
+     "model_family": "F5-TTS",
+     "model_hub": "modelscope",
+     "model_id": "SWivid/F5-TTS_Emilia-ZH-EN",
+     "model_revision": "master",
+     "model_ability": "text-to-audio",
+     "multilingual": true
    }
  ]
xinference/model/audio/utils.py
@@ -11,8 +11,40 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
+ import io
+
+ import numpy as np
+
  from .core import AudioModelFamilyV1


  def get_model_version(audio_model: AudioModelFamilyV1) -> str:
      return audio_model.model_name
+
+
+ def ensure_sample_rate(
+     audio: np.ndarray, old_sample_rate: int, sample_rate: int
+ ) -> np.ndarray:
+     import soundfile as sf
+     from scipy.signal import resample
+
+     if old_sample_rate != sample_rate:
+         # Calculate the new data length
+         new_length = int(len(audio) * sample_rate / old_sample_rate)
+
+         # Resample the data
+         resampled_data = resample(audio, new_length)
+
+         # Use BytesIO to save the resampled data to memory
+         with io.BytesIO() as buffer:
+             # Write the resampled data to the memory buffer
+             sf.write(buffer, resampled_data, sample_rate, format="WAV")
+
+             # Reset the buffer position to the beginning
+             buffer.seek(0)
+
+             # Read the data from the memory buffer
+             audio, sr = sf.read(buffer, dtype="float32")
+
+     return audio
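
Since the hunk above adds ensure_sample_rate as a plain helper, here is a minimal usage sketch; the input file name and target rate are illustrative, not taken from the diff.

# Hypothetical sketch: resample a clip to 16 kHz with the helper added above.
import soundfile as sf

from xinference.model.audio.utils import ensure_sample_rate

audio, sr = sf.read("input.wav", dtype="float32")  # e.g. a clip recorded at 44100 Hz
audio_16k = ensure_sample_rate(audio, sr, 16000)   # float32 samples resampled to 16 kHz
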
xinference/model/audio/whisper.py
@@ -13,9 +13,12 @@
  # limitations under the License.
  import logging
  import os
+ import typing
  from glob import glob
  from typing import TYPE_CHECKING, Dict, List, Optional, Union

+ from typing_extensions import TypedDict
+
  from ...device_utils import (
      get_available_device,
      get_device_preferred_dtype,
@@ -28,6 +31,13 @@ if TYPE_CHECKING:
  logger = logging.getLogger(__name__)


+ class WhisperModelConfig(TypedDict, total=False):
+     chunk_length_s: Optional[float]
+     stride_length_s: Optional[float]
+     return_timestamps: Optional[bool]
+     batch_size: Optional[int]
+
+
  class WhisperModel:
      def __init__(
          self,
@@ -35,6 +45,7 @@ class WhisperModel:
          model_path: str,
          model_spec: "AudioModelFamilyV1",
          device: Optional[str] = None,
+         max_new_tokens: Optional[int] = 128,
          **kwargs,
      ):
          self._model_uid = model_uid
@@ -42,7 +53,21 @@ class WhisperModel:
          self._model_spec = model_spec
          self._device = device
          self._model = None
-         self._kwargs = kwargs
+         self._max_new_tokens = max_new_tokens
+         self._model_config: WhisperModelConfig = self._sanitize_model_config(
+             typing.cast(WhisperModelConfig, kwargs)
+         )
+
+     def _sanitize_model_config(
+         self, model_config: Optional[WhisperModelConfig]
+     ) -> WhisperModelConfig:
+         if model_config is None:
+             model_config = WhisperModelConfig()
+         model_config.setdefault("chunk_length_s", 30)
+         model_config.setdefault("stride_length_s", None)
+         model_config.setdefault("return_timestamps", False)
+         model_config.setdefault("batch_size", 16)
+         return model_config

      @property
      def model_ability(self):
@@ -75,10 +100,10 @@ class WhisperModel:
              model=model,
              tokenizer=processor.tokenizer,
              feature_extractor=processor.feature_extractor,
-             max_new_tokens=128,
-             chunk_length_s=30,
-             batch_size=16,
-             return_timestamps=False,
+             chunk_length_s=self._model_config.get("chunk_length_s"),
+             stride_length_s=self._model_config.get("stride_length_s"),
+             return_timestamps=self._model_config.get("return_timestamps"),
+             batch_size=self._model_config.get("batch_size"),
              torch_dtype=torch_dtype,
              device=self._device,
          )
@@ -185,13 +210,13 @@ class WhisperModel:
              logger.warning(
                  "Prompt for whisper transcriptions will be ignored: %s", prompt
              )
+         generate_kwargs = {"max_new_tokens": self._max_new_tokens, "task": "transcribe"}
+         if language is not None:
+             generate_kwargs["language"] = language
+
          return self._call_model(
              audio=audio,
-             generate_kwargs=(
-                 {"language": language, "task": "transcribe"}
-                 if language is not None
-                 else {"task": "transcribe"}
-             ),
+             generate_kwargs=generate_kwargs,
              response_format=response_format,
              temperature=temperature,
              timestamp_granularities=timestamp_granularities,
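
The whisper.py changes above replace hard-coded pipeline settings with a WhisperModelConfig populated from keyword arguments. Below is a hedged sketch of what that enables, assuming extra launch kwargs are forwarded to the model constructor (an assumption, not shown in this diff); the endpoint, model name, and values are illustrative.

# Hypothetical sketch: tune the new whisper pipeline options at launch time.
from xinference.client import Client

client = Client("http://localhost:9997")  # endpoint is an assumption
model_uid = client.launch_model(
    model_name="whisper-large-v3",
    model_type="audio",
    max_new_tokens=256,     # new constructor argument (previously fixed at 128)
    chunk_length_s=30,      # WhisperModelConfig keys sanitized by _sanitize_model_config()
    stride_length_s=None,
    return_timestamps=False,
    batch_size=8,
)
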
xinference/model/audio/whisper_mlx.py (new file)
@@ -0,0 +1,208 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import functools
+ import itertools
+ import logging
+ import tempfile
+ from typing import TYPE_CHECKING, List, Optional
+
+ if TYPE_CHECKING:
+     from .core import AudioModelFamilyV1
+
+ logger = logging.getLogger(__name__)
+
+
+ class WhisperMLXModel:
+     def __init__(
+         self,
+         model_uid: str,
+         model_path: str,
+         model_spec: "AudioModelFamilyV1",
+         device: Optional[str] = None,
+         **kwargs,
+     ):
+         self._model_uid = model_uid
+         self._model_path = model_path
+         self._model_spec = model_spec
+         self._device = device
+         self._model = None
+         self._kwargs = kwargs
+         self._use_lighting = False
+
+     @property
+     def model_ability(self):
+         return self._model_spec.model_ability
+
+     def load(self):
+         use_lightning = self._kwargs.get("use_lightning", "auto")
+         if use_lightning not in ("auto", True, False, None):
+             raise ValueError("use_lightning can only be True, False, None or auto")
+
+         if use_lightning == "auto" or use_lightning is True:
+             try:
+                 import mlx.core as mx
+                 from lightning_whisper_mlx.transcribe import ModelHolder
+             except ImportError:
+                 if use_lightning == "auto":
+                     use_lightning = False
+                 else:
+                     error_message = "Failed to import module 'lightning_whisper_mlx'"
+                     installation_guide = [
+                         "Please make sure 'lightning_whisper_mlx' is installed.\n",
+                     ]
+
+                     raise ImportError(
+                         f"{error_message}\n\n{''.join(installation_guide)}"
+                     )
+             else:
+                 use_lightning = True
+         if not use_lightning:
+             try:
+                 import mlx.core as mx  # noqa: F811
+                 from mlx_whisper.transcribe import ModelHolder  # noqa: F811
+             except ImportError:
+                 error_message = "Failed to import module 'mlx_whisper'"
+                 installation_guide = [
+                     "Please make sure 'mlx_whisper' is installed.\n",
+                 ]
+
+                 raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+             else:
+                 use_lightning = False
+
+         logger.info(
+             "Loading MLX whisper from %s, use lightning: %s",
+             self._model_path,
+             use_lightning,
+         )
+         self._use_lighting = use_lightning
+         self._model = ModelHolder.get_model(self._model_path, mx.float16)
+
+     def transcriptions(
+         self,
+         audio: bytes,
+         language: Optional[str] = None,
+         prompt: Optional[str] = None,
+         response_format: str = "json",
+         temperature: float = 0,
+         timestamp_granularities: Optional[List[str]] = None,
+     ):
+         return self._call(
+             audio,
+             language=language,
+             prompt=prompt,
+             response_format=response_format,
+             temperature=temperature,
+             timestamp_granularities=timestamp_granularities,
+             task="transcribe",
+         )
+
+     def translations(
+         self,
+         audio: bytes,
+         language: Optional[str] = None,
+         prompt: Optional[str] = None,
+         response_format: str = "json",
+         temperature: float = 0,
+         timestamp_granularities: Optional[List[str]] = None,
+     ):
+         if not self._model_spec.multilingual:
+             raise RuntimeError(
+                 f"Model {self._model_spec.model_name} is not suitable for translations."
+             )
+         return self._call(
+             audio,
+             language=language,
+             prompt=prompt,
+             response_format=response_format,
+             temperature=temperature,
+             timestamp_granularities=timestamp_granularities,
+             task="translate",
+         )
+
+     def _call(
+         self,
+         audio: bytes,
+         language: Optional[str] = None,
+         prompt: Optional[str] = None,
+         response_format: str = "json",
+         temperature: float = 0,
+         timestamp_granularities: Optional[List[str]] = None,
+         task: str = "transcribe",
+     ):
+         if self._use_lighting:
+             from lightning_whisper_mlx.transcribe import transcribe_audio
+
+             transcribe = functools.partial(
+                 transcribe_audio, batch_size=self._kwargs.get("batch_size", 12)
+             )
+         else:
+             from mlx_whisper import transcribe  # type: ignore
+
+         with tempfile.NamedTemporaryFile(delete=True) as f:
+             f.write(audio)
+
+             kwargs = {"task": task}
+             if response_format == "verbose_json":
+                 if timestamp_granularities == ["word"]:
+                     kwargs["word_timestamps"] = True  # type: ignore
+
+             result = transcribe(
+                 f.name,
+                 path_or_hf_repo=self._model_path,
+                 language=language,
+                 temperature=temperature,
+                 initial_prompt=prompt,
+                 **kwargs,
+             )
+             text = result["text"]
+             segments = result["segments"]
+             language = result["language"]
+
+             if response_format == "json":
+                 return {"text": text}
+             elif response_format == "verbose_json":
+                 if not timestamp_granularities or timestamp_granularities == [
+                     "segment"
+                 ]:
+                     return {
+                         "task": task,
+                         "language": language,
+                         "duration": segments[-1]["end"] if segments else 0,
+                         "text": text,
+                         "segments": segments,
+                     }
+                 else:
+                     assert timestamp_granularities == ["word"]
+
+                     def _extract_word(word: dict) -> dict:
+                         return {
+                             "start": word["start"].item(),
+                             "end": word["end"].item(),
+                             "word": word["word"],
+                         }
+
+                     words = [
+                         _extract_word(w)
+                         for w in itertools.chain(*[s["words"] for s in segments])
+                     ]
+                     return {
+                         "task": task,
+                         "language": language,
+                         "duration": words[-1]["end"] if words else 0,
+                         "text": text,
+                         "words": words,
+                     }
+             else:
+                 raise ValueError(f"Unsupported response format: {response_format}")