xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/client/restful/restful_client.py CHANGED
@@ -126,6 +126,43 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
126
126
  response_data = response.json()
127
127
  return response_data
128
128
 
129
+ def convert_ids_to_tokens(
130
+ self, input: Union[List, List[List]], **kwargs
131
+ ) -> List[str]:
132
+ """
133
+ Convert token IDs to human readable tokens via RESTful APIs.
134
+
135
+ Parameters
136
+ ----------
137
+ input: Union[List, List[List]]
138
+ Input token IDs to convert, can be a single list of token IDs or a list of token ID lists.
139
+ To convert multiple sequences in a single request, pass a list of token ID lists.
140
+
141
+ Returns
142
+ -------
143
+ list
144
+ A list of decoded tokens in human readable format.
145
+
146
+ Raises
147
+ ------
148
+ RuntimeError
149
+ Report the failure of token conversion and provide the error message.
150
+
151
+ """
152
+ url = f"{self._base_url}/v1/convert_ids_to_tokens"
153
+ request_body = {
154
+ "model": self._model_uid,
155
+ "input": input,
156
+ }
157
+ request_body.update(kwargs)
158
+ response = requests.post(url, json=request_body, headers=self.auth_headers)
159
+ if response.status_code != 200:
160
+ raise RuntimeError(
161
+ f"Failed to decode token ids, detail: {_get_error_string(response)}"
162
+ )
163
+ response_data = response.json()
164
+ return response_data
165
+
129
166
 
130
167
  class RESTfulRerankModelHandle(RESTfulModelHandle):
131
168
  def rerank(
@@ -174,6 +211,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
174
211
  "max_chunks_per_doc": max_chunks_per_doc,
175
212
  "return_documents": return_documents,
176
213
  "return_len": return_len,
214
+ "kwargs": json.dumps(kwargs),
177
215
  }
178
216
  request_body.update(kwargs)
179
217
  response = requests.post(url, json=request_body, headers=self.auth_headers)
@@ -703,6 +741,8 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
703
741
  The speed of the generated audio.
704
742
  stream: bool
705
743
  Use stream or not.
744
+ prompt_speech: bytes
745
+ The audio bytes to be provided to the model.
706
746
 
707
747
  Returns
708
748
  -------
@@ -1357,7 +1397,7 @@ class Client:
1357
1397
  response_data = response.json()
1358
1398
  return response_data
1359
1399
 
1360
- def abort_request(self, model_uid: str, request_id: str):
1400
+ def abort_request(self, model_uid: str, request_id: str, block_duration: int = 30):
1361
1401
  """
1362
1402
  Abort a request.
1363
1403
  Abort a submitted request. If the request is finished or not found, this method will be a no-op.
@@ -1369,13 +1409,18 @@ class Client:
1369
1409
  Model uid.
1370
1410
  request_id: str
1371
1411
  Request id.
1412
+ block_duration: int
1413
+ The duration to make the request id abort. If set to 0, the abort_request will be immediate, which may
1414
+ prevent it from taking effect if it arrives before the request operation.
1372
1415
  Returns
1373
1416
  -------
1374
1417
  Dict
1375
1418
  Return empty dict.
1376
1419
  """
1377
1420
  url = f"{self.base_url}/v1/models/{model_uid}/requests/{request_id}/abort"
1378
- response = requests.post(url, headers=self._headers)
1421
+ response = requests.post(
1422
+ url, headers=self._headers, json={"block_duration": block_duration}
1423
+ )
1379
1424
  if response.status_code != 200:
1380
1425
  raise RuntimeError(
1381
1426
  f"Failed to abort request, detail: {_get_error_string(response)}"
xinference/constants.py CHANGED
@@ -88,3 +88,4 @@ XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
88
88
  XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE, None
89
89
  )
90
90
  XINFERENCE_LAUNCH_MODEL_RETRY = 3
91
+ XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION = 30
xinference/core/chat_interface.py CHANGED
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import base64
16
+ import html
16
17
  import logging
17
18
  import os
18
19
  from io import BytesIO
@@ -137,7 +138,11 @@ class GradioInterface:
137
138
  if "content" not in delta:
138
139
  continue
139
140
  else:
140
- response_content += delta["content"]
141
+ # some model like deepseek-r1-distill-qwen
142
+ # will generate <think>...</think> ...
143
+ # in gradio, no output will be rendered,
144
+ # thus escape html tags in advance
145
+ response_content += html.escape(delta["content"])
141
146
  yield response_content
142
147
 
143
148
  yield response_content
xinference/core/model.py CHANGED
@@ -35,12 +35,14 @@ from typing import (
35
35
  List,
36
36
  Optional,
37
37
  Union,
38
+ no_type_check,
38
39
  )
39
40
 
40
41
  import sse_starlette.sse
41
42
  import xoscar as xo
42
43
 
43
44
  from ..constants import (
45
+ XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION,
44
46
  XINFERENCE_LAUNCH_MODEL_RETRY,
45
47
  XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE,
46
48
  )
@@ -57,7 +59,7 @@ import logging
57
59
  logger = logging.getLogger(__name__)
58
60
 
59
61
  from ..device_utils import empty_cache
60
- from .utils import json_dumps, log_async
62
+ from .utils import CancelMixin, json_dumps, log_async
61
63
 
62
64
  try:
63
65
  from torch.cuda import OutOfMemoryError
@@ -77,6 +79,9 @@ XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
77
79
  ]
78
80
 
79
81
  XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
82
+ XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
83
+ os.getenv("XINFERENCE_TEST_OUT_OF_MEMORY_ERROR", False)
84
+ )
80
85
 
81
86
 
82
87
  def request_limit(fn):
@@ -90,21 +95,26 @@ def request_limit(fn):
90
95
  logger.debug(
91
96
  f"Request {fn.__name__}, current serve request count: {self._serve_count}, request limit: {self._request_limits} for the model {self.model_uid()}"
92
97
  )
93
- if self._request_limits is not None:
94
- if 1 + self._serve_count <= self._request_limits:
95
- self._serve_count += 1
96
- else:
97
- raise RuntimeError(
98
- f"Rate limit reached for the model. Request limit {self._request_limits} for the model: {self.model_uid()}"
99
- )
98
+ if 1 + self._serve_count <= self._request_limits:
99
+ self._serve_count += 1
100
+ else:
101
+ raise RuntimeError(
102
+ f"Rate limit reached for the model. Request limit {self._request_limits} for the model: {self.model_uid()}"
103
+ )
104
+ ret = None
100
105
  try:
101
106
  ret = await fn(self, *args, **kwargs)
102
107
  finally:
103
- if self._request_limits is not None:
108
+ if ret is not None and (
109
+ inspect.isasyncgen(ret) or inspect.isgenerator(ret)
110
+ ):
111
+ # stream case, let client call model_ref to decrease self._serve_count
112
+ pass
113
+ else:
104
114
  self._serve_count -= 1
105
- logger.debug(
106
- f"After request {fn.__name__}, current serve request count: {self._serve_count} for the model {self.model_uid()}"
107
- )
115
+ logger.debug(
116
+ f"After request {fn.__name__}, current serve request count: {self._serve_count} for the model {self.model_uid()}"
117
+ )
108
118
  return ret
109
119
 
110
120
  return wrapped_func
@@ -112,20 +122,25 @@ def request_limit(fn):
112
122
 
113
123
  def oom_check(fn):
114
124
  @functools.wraps(fn)
115
- def _wrapper(*args, **kwargs):
125
+ def _wrapper(self, *args, **kwargs):
116
126
  try:
117
- return fn(*args, **kwargs)
118
- except OutOfMemoryError:
119
- logger.exception("Model actor is out of memory.")
120
- os._exit(1)
127
+ if XINFERENCE_TEST_OUT_OF_MEMORY_ERROR:
128
+ raise OutOfMemoryError("Test Out of Memory Error")
129
+ return fn(self, *args, **kwargs)
130
+ except OutOfMemoryError as ex:
131
+ assert self._loop is not None
132
+ asyncio.run_coroutine_threadsafe(
133
+ self._handle_oom_error(ex), loop=self._loop
134
+ )
121
135
 
122
136
  @functools.wraps(fn)
123
- async def _async_wrapper(*args, **kwargs):
137
+ async def _async_wrapper(self, *args, **kwargs):
124
138
  try:
125
- return await fn(*args, **kwargs)
126
- except OutOfMemoryError:
127
- logger.exception("Model actor is out of memory.")
128
- os._exit(1)
139
+ if XINFERENCE_TEST_OUT_OF_MEMORY_ERROR:
140
+ raise OutOfMemoryError("Test Out of Memory Error")
141
+ return await fn(self, *args, **kwargs)
142
+ except OutOfMemoryError as ex:
143
+ await self._handle_oom_error(ex)
129
144
 
130
145
  assert not inspect.isasyncgen(fn)
131
146
  assert not inspect.isgenerator(fn)
@@ -136,7 +151,7 @@ def oom_check(fn):
136
151
  return _wrapper
137
152
 
138
153
 
139
- class ModelActor(xo.StatelessActor):
154
+ class ModelActor(xo.StatelessActor, CancelMixin):
140
155
  _replica_model_uid: Optional[str]
141
156
 
142
157
  @classmethod
@@ -172,6 +187,16 @@ class ModelActor(xo.StatelessActor):
172
187
  if hasattr(self._model, "stop") and callable(self._model.stop):
173
188
  self._model.stop()
174
189
 
190
+ if isinstance(self._model, LLMVLLMModel):
191
+ if self._transfer_ref is not None:
192
+ try:
193
+ await xo.destroy_actor(self._transfer_ref)
194
+ del self._transfer_ref
195
+ except Exception as e:
196
+ logger.debug(
197
+ f"Destroy transfer actor failed, address: {self.address}, error: {e}"
198
+ )
199
+
175
200
  if (
176
201
  isinstance(self._model, (LLMPytorchModel, LLMVLLMModel, SGLANGModel))
177
202
  and self._model.model_spec.model_format == "pytorch"
@@ -200,6 +225,7 @@ class ModelActor(xo.StatelessActor):
200
225
  replica_model_uid: str,
201
226
  model_description: Optional["ModelDescription"] = None,
202
227
  request_limits: Optional[int] = None,
228
+ xavier_config: Optional[Dict] = None,
203
229
  ):
204
230
  super().__init__()
205
231
  from ..model.llm.lmdeploy.core import LMDeployModel
@@ -214,7 +240,9 @@ class ModelActor(xo.StatelessActor):
214
240
  self._model_description = (
215
241
  model_description.to_dict() if model_description else {}
216
242
  )
217
- self._request_limits = request_limits
243
+ self._request_limits = (
244
+ float("inf") if request_limits is None else request_limits
245
+ )
218
246
  self._pending_requests: asyncio.Queue = asyncio.Queue()
219
247
  self._handle_pending_requests_task = None
220
248
  self._lock = (
@@ -239,6 +267,11 @@ class ModelActor(xo.StatelessActor):
239
267
  self._scheduler_ref = None
240
268
  self._text_to_image_scheduler_ref = None
241
269
 
270
+ if isinstance(self._model, VLLMModel):
271
+ self._xavier_config = xavier_config
272
+ self._model.set_xavier_config(xavier_config)
273
+ self._transfer_ref = None
274
+
242
275
  async def __post_create__(self):
243
276
  self._loop = asyncio.get_running_loop()
244
277
 
@@ -267,6 +300,32 @@ class ModelActor(xo.StatelessActor):
267
300
  def __repr__(self) -> str:
268
301
  return f"ModelActor({self._replica_model_uid})"
269
302
 
303
+ def decrease_serve_count(self):
304
+ self._serve_count -= 1
305
+
306
+ @no_type_check
307
+ async def start_transfer_for_vllm(self, rank_addresses: List[str]):
308
+ from ..model.llm.vllm.core import VLLMModel
309
+ from ..model.llm.vllm.xavier.transfer import TransferActor
310
+
311
+ assert isinstance(self._model, VLLMModel)
312
+ rank = self._xavier_config.get("rank") # type: ignore
313
+ self._transfer_ref = await xo.create_actor(
314
+ TransferActor,
315
+ address=self.address,
316
+ uid=f"{TransferActor.default_uid()}-{rank}",
317
+ rank=rank,
318
+ world_size=self._xavier_config.get("world_size"), # type: ignore
319
+ rank_address=self._xavier_config.get("rank_address"), # type: ignore
320
+ store_address=self._xavier_config.get("store_address"), # type: ignore
321
+ store_port=self._xavier_config.get("store_port"), # type: ignore
322
+ world_addresses=rank_addresses,
323
+ )
324
+ await self._model.init_xavier()
325
+ logger.debug(
326
+ f"Init transfer actor: {self._transfer_ref.address}, rank: {rank} done for vllm." # type: ignore
327
+ )
328
+
270
329
  async def _record_completion_metrics(
271
330
  self, duration, completion_tokens, prompt_tokens
272
331
  ):
@@ -429,11 +488,24 @@ class ModelActor(xo.StatelessActor):
429
488
  )
430
489
  )
431
490
 
491
+ async def _handle_oom_error(self, ex):
492
+ error_message = (
493
+ f"Model actor is out of memory, model id: {self.model_uid()}, error: {ex}"
494
+ )
495
+ logger.exception(error_message)
496
+ worker_ref = await self._get_worker_ref()
497
+ await worker_ref.update_model_status(
498
+ self._replica_model_uid, last_error=error_message
499
+ )
500
+ os._exit(1)
501
+
432
502
  def _to_generator(self, output_type: str, gen: types.GeneratorType):
433
503
  start_time = time.time()
434
504
  time_to_first_token = None
435
505
  final_usage = None
436
506
  try:
507
+ if XINFERENCE_TEST_OUT_OF_MEMORY_ERROR:
508
+ raise OutOfMemoryError("Test Out of Memory Error")
437
509
  for v in gen:
438
510
  if time_to_first_token is None:
439
511
  time_to_first_token = (time.time() - start_time) * 1000
@@ -445,11 +517,11 @@ class ModelActor(xo.StatelessActor):
445
517
  output_type == "binary"
446
518
  ), f"Unknown output type '{output_type}'"
447
519
  yield sse_starlette.sse.ensure_bytes(v, None)
448
- except OutOfMemoryError:
449
- logger.exception(
450
- "Model actor is out of memory, model id: %s", self.model_uid()
520
+ except OutOfMemoryError as ex:
521
+ assert self._loop is not None
522
+ asyncio.run_coroutine_threadsafe(
523
+ self._handle_oom_error(ex), loop=self._loop
451
524
  )
452
- os._exit(1)
453
525
  finally:
454
526
  if self._loop is not None and time_to_first_token is not None:
455
527
  coro = self.record_metrics(
@@ -471,6 +543,8 @@ class ModelActor(xo.StatelessActor):
471
543
  time_to_first_token = None
472
544
  final_usage = None
473
545
  try:
546
+ if XINFERENCE_TEST_OUT_OF_MEMORY_ERROR:
547
+ raise OutOfMemoryError("Test Out of Memory Error")
474
548
  async for v in gen:
475
549
  if time_to_first_token is None:
476
550
  time_to_first_token = (time.time() - start_time) * 1000
@@ -483,11 +557,8 @@ class ModelActor(xo.StatelessActor):
483
557
  output_type == "binary"
484
558
  ), f"Unknown output type '{output_type}'"
485
559
  yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
486
- except OutOfMemoryError:
487
- logger.exception(
488
- "Model actor is out of memory, model id: %s", self.model_uid()
489
- )
490
- os._exit(1)
560
+ except OutOfMemoryError as ex:
561
+ await self._handle_oom_error(ex)
491
562
  finally:
492
563
  coros = []
493
564
  if time_to_first_token is not None:
@@ -553,6 +624,7 @@ class ModelActor(xo.StatelessActor):
553
624
 
554
625
  @oom_check
555
626
  async def _call_wrapper(self, output_type: str, fn: Callable, *args, **kwargs):
627
+ self._add_running_task(kwargs.get("request_id"))
556
628
  if self._lock is None:
557
629
  if inspect.iscoroutinefunction(fn):
558
630
  ret = await fn(*args, **kwargs)
@@ -761,9 +833,14 @@ class ModelActor(xo.StatelessActor):
761
833
  prompt_tokens,
762
834
  )
763
835
 
764
- async def abort_request(self, request_id: str) -> str:
836
+ async def abort_request(
837
+ self,
838
+ request_id: str,
839
+ block_duration: int = XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION,
840
+ ) -> str:
765
841
  from .utils import AbortRequestMessage
766
842
 
843
+ self._cancel_running_task(request_id, block_duration)
767
844
  if self.allow_batching():
768
845
  if self._scheduler_ref is None:
769
846
  return AbortRequestMessage.NOT_FOUND.name
@@ -787,6 +864,19 @@ class ModelActor(xo.StatelessActor):
787
864
  f"Model {self._model.model_spec} is not for creating embedding."
788
865
  )
789
866
 
867
+ @request_limit
868
+ @log_async(logger=logger)
869
+ async def convert_ids_to_tokens(
870
+ self, input: Union[List, List[List]], *args, **kwargs
871
+ ):
872
+ kwargs.pop("request_id", None)
873
+ if hasattr(self._model, "convert_ids_to_tokens"):
874
+ return await self._call_wrapper_json(
875
+ self._model.convert_ids_to_tokens, input, *args, **kwargs
876
+ )
877
+
878
+ raise AttributeError(f"Model {self._model.model_spec} can convert token id.")
879
+
790
880
  @request_limit
791
881
  @log_async(logger=logger)
792
882
  async def rerank(
xinference/core/supervisor.py CHANGED
@@ -35,6 +35,7 @@ from typing import (
35
35
  import xoscar as xo
36
36
 
37
37
  from ..constants import (
38
+ XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION,
38
39
  XINFERENCE_DISABLE_HEALTH_CHECK,
39
40
  XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
40
41
  XINFERENCE_HEALTH_CHECK_INTERVAL,
@@ -266,6 +267,14 @@ class SupervisorActor(xo.StatelessActor):
266
267
  signal.SIGTERM, lambda: asyncio.create_task(signal_handler())
267
268
  )
268
269
 
270
+ from ..model.llm.vllm.xavier.block_tracker import VLLMBlockTracker
271
+ from ..model.llm.vllm.xavier.collective_manager import CollectiveManager
272
+
273
+ self._block_tracker_mapping: Dict[str, xo.ActorRefType[VLLMBlockTracker]] = {}
274
+ self._collective_manager_mapping: Dict[
275
+ str, xo.ActorRefType[CollectiveManager]
276
+ ] = {}
277
+
269
278
  @typing.no_type_check
270
279
  async def get_cluster_device_info(self, detailed: bool = False) -> List:
271
280
  import psutil
@@ -958,29 +967,83 @@ class SupervisorActor(xo.StatelessActor):
958
967
  if model_uid is None:
959
968
  model_uid = self._gen_model_uid(model_name)
960
969
 
970
+ # Xavier-related
971
+ enable_xavier: bool = (
972
+ bool(kwargs.pop("enable_xavier", False))
973
+ and model_engine is not None
974
+ and model_engine.lower() == "vllm"
975
+ )
976
+ store_address = None
977
+ store_port = None
978
+ world_size = None
979
+ if enable_xavier:
980
+ if replica <= 1:
981
+ logger.warning(f"Enabling xavier when `replica<=1` is meaningless.")
982
+ enable_xavier = False
983
+ else:
984
+ from ..model.llm.vllm.xavier.block_tracker import VLLMBlockTracker
985
+ from ..model.llm.vllm.xavier.collective_manager import CollectiveManager
986
+
987
+ self._block_tracker_mapping[model_uid] = await xo.create_actor(
988
+ VLLMBlockTracker,
989
+ address=self.address,
990
+ uid=f"{VLLMBlockTracker.default_uid()}-{model_uid}",
991
+ )
992
+ world_size = replica + 1
993
+ logger.info(f"Going to start xavier with world size: {world_size}")
994
+ self._collective_manager_mapping[model_uid] = await xo.create_actor(
995
+ CollectiveManager,
996
+ address=self.address,
997
+ uid=f"{CollectiveManager.default_uid()}-{model_uid}",
998
+ model_uid=model_uid,
999
+ )
1000
+ logger.info(f"Start collective manager for {model_uid} done.")
1001
+
961
1002
  model_size = str(model_size_in_billions) if model_size_in_billions else ""
962
1003
  logger.debug(
963
1004
  f"Enter launch_builtin_model, model_uid: {model_uid}, model_name: {model_name}, model_size: {model_size}, "
964
- f"model_format: {model_format}, quantization: {quantization}, replica: {replica}, "
1005
+ f"model_format: {model_format}, quantization: {quantization}, replica: {replica}, enable_xavier: {enable_xavier}, "
965
1006
  f"kwargs: {kwargs}"
966
1007
  )
967
1008
 
968
- async def _launch_one_model(_replica_model_uid):
1009
+ async def _launch_one_model(worker_ref, _replica_model_uid, rank: int):
969
1010
  if _replica_model_uid in self._replica_model_uid_to_worker:
970
1011
  raise ValueError(
971
1012
  f"Model is already in the model list, uid: {_replica_model_uid}"
972
1013
  )
1014
+
1015
+ nonlocal store_address
1016
+ nonlocal store_port
1017
+ xavier_config = (
1018
+ {
1019
+ "block_tracker_uid": self._block_tracker_mapping[model_uid].uid,
1020
+ "block_tracker_address": self._block_tracker_mapping[
1021
+ model_uid
1022
+ ].address,
1023
+ "rank": rank,
1024
+ "world_size": world_size,
1025
+ "store_address": store_address,
1026
+ "store_port": store_port,
1027
+ }
1028
+ if enable_xavier
1029
+ else None
1030
+ )
1031
+
1032
+ if enable_xavier and rank == 0:
1033
+ rank0_address, _port = await worker_ref.launch_rank0_model(
1034
+ _replica_model_uid, xavier_config
1035
+ )
1036
+ self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
1037
+ store_address = rank0_address.split(":")[0]
1038
+ store_port = _port
1039
+ return rank0_address
1040
+
973
1041
  replica_gpu_idx = assign_replica_gpu(_replica_model_uid, replica, gpu_idx)
974
1042
  nonlocal model_type
975
1043
 
976
- worker_ref = (
977
- target_ip_worker_ref
978
- if target_ip_worker_ref is not None
979
- else await self._choose_worker()
980
- )
981
1044
  # LLM as default for compatibility
982
1045
  model_type = model_type or "LLM"
983
- await worker_ref.launch_builtin_model(
1046
+ subpool_address = await worker_ref.launch_builtin_model(
984
1047
  model_uid=_replica_model_uid,
985
1048
  model_name=model_name,
986
1049
  model_size_in_billions=model_size_in_billions,
@@ -994,14 +1057,64 @@ class SupervisorActor(xo.StatelessActor):
994
1057
  gpu_idx=replica_gpu_idx,
995
1058
  download_hub=download_hub,
996
1059
  model_path=model_path,
1060
+ xavier_config=xavier_config,
997
1061
  **kwargs,
998
1062
  )
999
1063
  self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
1064
+ return subpool_address
1000
1065
 
1001
1066
  async def _launch_model():
1002
1067
  try:
1003
- for rep_model_uid in iter_replica_model_uid(model_uid, replica):
1004
- await _launch_one_model(rep_model_uid)
1068
+ worker_refs = []
1069
+ rank_addresses = []
1070
+ for _idx, rep_model_uid in enumerate(
1071
+ iter_replica_model_uid(model_uid, replica)
1072
+ ):
1073
+ worker_ref = (
1074
+ target_ip_worker_ref
1075
+ if target_ip_worker_ref is not None
1076
+ else await self._choose_worker()
1077
+ )
1078
+ if enable_xavier and _idx == 0:
1079
+ """
1080
+ Start the rank 0 model actor on the worker that holds the rank 1 replica,
1081
+ solely for constructing the collective communication world.
1082
+ """
1083
+ _uid = model_uid + "-rank0"
1084
+ rank0_address = await _launch_one_model(worker_ref, _uid, 0)
1085
+ worker_refs.append((worker_ref, _uid))
1086
+ rank_addresses.append(rank0_address)
1087
+
1088
+ subpool_address = await _launch_one_model(
1089
+ worker_ref, rep_model_uid, _idx + 1
1090
+ )
1091
+ worker_refs.append((worker_ref, rep_model_uid))
1092
+ rank_addresses.append(subpool_address)
1093
+
1094
+ # For xavier, start all the vllm instances first,
1095
+ # and then start the transfer component,
1096
+ # because the transfer actor needs all the rank addresses used for collective communication
1097
+ if enable_xavier:
1098
+ logger.debug(f"Init transfer component for xavier...")
1099
+ collective_manager_ref = self._collective_manager_mapping[model_uid]
1100
+ tasks = []
1101
+ for worker_ref, rep_model_uid in worker_refs:
1102
+ tasks.append(
1103
+ worker_ref.start_transfer_for_vllm(
1104
+ rep_model_uid, rank_addresses
1105
+ )
1106
+ )
1107
+ # Here you must use asyncio.gather, not a for loop,
1108
+ # or you will get stuck.
1109
+ await asyncio.gather(*tasks)
1110
+
1111
+ # init collective_manager
1112
+ for idx, addr in enumerate(rank_addresses):
1113
+ await collective_manager_ref.register_rank(
1114
+ idx, addr, update=False
1115
+ )
1116
+
1117
+ logger.debug(f"Init transfer component for xavier done.")
1005
1118
  except Exception:
1006
1119
  # terminate_model will remove the replica info.
1007
1120
  await self.terminate_model(model_uid, suppress_exception=True)
@@ -1130,6 +1243,38 @@ class SupervisorActor(xo.StatelessActor):
1130
1243
  raise
1131
1244
  self._model_uid_to_replica_info.pop(model_uid, None)
1132
1245
 
1246
+ # clear for xavier
1247
+ rank0_uid = model_uid + "-rank0"
1248
+ if rank0_uid in self._replica_model_uid_to_worker:
1249
+ await _terminate_one_model(rank0_uid)
1250
+
1251
+ collective_manager_ref = self._collective_manager_mapping.pop(model_uid, None)
1252
+ if collective_manager_ref is not None:
1253
+ try:
1254
+ await xo.destroy_actor(collective_manager_ref)
1255
+ except Exception as e:
1256
+ logger.debug(
1257
+ "Destroy collective_manager_ref failed, model uid: %s, error: %s",
1258
+ model_uid,
1259
+ e,
1260
+ )
1261
+ finally:
1262
+ logger.debug(
1263
+ f"Destroy collective_manager_ref done. model uid: {model_uid}"
1264
+ )
1265
+ block_tracker_ref = self._block_tracker_mapping.pop(model_uid, None)
1266
+ if block_tracker_ref is not None:
1267
+ try:
1268
+ await xo.destroy_actor(block_tracker_ref)
1269
+ except Exception as e:
1270
+ logger.debug(
1271
+ "Destroy block_tracker_ref failed, model uid: %s, error: %s",
1272
+ model_uid,
1273
+ e,
1274
+ )
1275
+ finally:
1276
+ logger.debug(f"Destroy block_tracker_ref done. model uid: {model_uid}")
1277
+
1133
1278
  @log_async(logger=logger)
1134
1279
  async def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
1135
1280
  replica_info = self._model_uid_to_replica_info.get(model_uid, None)
@@ -1147,6 +1292,15 @@ class SupervisorActor(xo.StatelessActor):
1147
1292
  )
1148
1293
  return await worker_ref.get_model(model_uid=replica_model_uid)
1149
1294
 
1295
+ @log_async(logger=logger)
1296
+ async def get_model_status(self, replica_model_uid: str):
1297
+ worker_ref = self._replica_model_uid_to_worker.get(replica_model_uid, None)
1298
+ if worker_ref is None:
1299
+ raise ValueError(
1300
+ f"Model not found in the model list, uid: {replica_model_uid}"
1301
+ )
1302
+ return await worker_ref.get_model_status(replica_model_uid)
1303
+
1150
1304
  @log_async(logger=logger)
1151
1305
  async def describe_model(self, model_uid: str) -> Dict[str, Any]:
1152
1306
  replica_info = self._model_uid_to_replica_info.get(model_uid, None)
@@ -1213,7 +1367,12 @@ class SupervisorActor(xo.StatelessActor):
1213
1367
  return cached_models
1214
1368
 
1215
1369
  @log_async(logger=logger)
1216
- async def abort_request(self, model_uid: str, request_id: str) -> Dict:
1370
+ async def abort_request(
1371
+ self,
1372
+ model_uid: str,
1373
+ request_id: str,
1374
+ block_duration: int = XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION,
1375
+ ) -> Dict:
1217
1376
  from .scheduler import AbortRequestMessage
1218
1377
 
1219
1378
  res = {"msg": AbortRequestMessage.NO_OP.name}
@@ -1228,7 +1387,7 @@ class SupervisorActor(xo.StatelessActor):
1228
1387
  if worker_ref is None:
1229
1388
  continue
1230
1389
  model_ref = await worker_ref.get_model(model_uid=rep_mid)
1231
- result_info = await model_ref.abort_request(request_id)
1390
+ result_info = await model_ref.abort_request(request_id, block_duration)
1232
1391
  res["msg"] = result_info
1233
1392
  if result_info == AbortRequestMessage.DONE.name:
1234
1393
  break
@@ -1371,3 +1530,12 @@ class SupervisorActor(xo.StatelessActor):
1371
1530
 
1372
1531
  async def get_progress(self, request_id: str) -> float:
1373
1532
  return await self._progress_tracker.get_progress(request_id)
1533
+
1534
+ async def call_collective_manager(
1535
+ self, model_uid: str, func_name: str, *args, **kwargs
1536
+ ):
1537
+ """
1538
+ Used by worker.
1539
+ """
1540
+ collective_manager_ref = self._collective_manager_mapping[model_uid]
1541
+ await getattr(collective_manager_ref, func_name)(*args, **kwargs)