xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,134 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
+
+import xoscar as xo
+from vllm.executor.gpu_executor import GPUExecutorAsync
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest, PoolerOutput
+from vllm.utils import is_pin_memory_available
+from vllm.worker.cache_engine import CacheEngine
+
+if TYPE_CHECKING:
+    from .scheduler import XavierScheduler
+
+
+class XavierExecutor(GPUExecutorAsync):
+    scheduler: Optional[List["XavierScheduler"]] = None
+
+    def _init_executor(self) -> None:
+        super()._init_executor()
+        self._transfer_ref = None
+        self._block_tracker_ref = None
+
+    async def init_transfer(self):
+        """
+        In vllm, the `cache_engine` is the entity that truly manages the KV cache tensors.
+        Retrieve the necessary transmission information from the `cache_engine`.
+        """
+        transfer_ref = await self._get_transfer_ref()
+        ref_cache_engine: CacheEngine = self.driver_worker.cache_engine[0]
+        buffer_dtype = ref_cache_engine.dtype
+        buffer_device = "cpu"
+        buffer_pin_memory = is_pin_memory_available()
+        num_attn_layers = ref_cache_engine.num_attention_layers
+        kv_cache_shape = ref_cache_engine.gpu_cache[0].shape
+        assert kv_cache_shape[0] == 2
+        buffer_num = 2
+        transfer_block_num = self.vllm_config.xavier_config.get("transfer_block_num")
+        buffer_shape = (
+            transfer_block_num,
+            num_attn_layers,
+            kv_cache_shape[0],
+            *kv_cache_shape[2:],
+        )
+        await transfer_ref.setup(
+            self.driver_worker.cache_engine,
+            self.scheduler,
+            num_buffer=buffer_num,
+            buffer_shape=buffer_shape,
+            buffer_dtype=buffer_dtype,
+            buffer_device=buffer_device,
+            pin_memory=buffer_pin_memory,
+        )
+
+    async def _get_block_tracker_ref(self):
+        if self._block_tracker_ref is None:
+            block_tracker_address = self.vllm_config.xavier_config.get(
+                "block_tracker_address"
+            )
+            block_tracker_uid = self.vllm_config.xavier_config.get("block_tracker_uid")
+            self._block_tracker_ref = await xo.actor_ref(
+                address=block_tracker_address, uid=block_tracker_uid
+            )
+        return self._block_tracker_ref
+
+    async def _get_transfer_ref(self):
+        from .transfer import TransferActor
+
+        if self._transfer_ref is None:
+            transfer_address = self.vllm_config.xavier_config.get("rank_address")
+            rank = self.vllm_config.xavier_config.get("rank")
+            self._transfer_ref = await xo.actor_ref(
+                address=transfer_address, uid=f"{TransferActor.default_uid()}-{rank}"
+            )
+        return self._transfer_ref
+
+    def get_rank(self) -> int:
+        return self.vllm_config.xavier_config.get("rank")
+
+    async def execute_model_async(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[Union[SamplerOutput, PoolerOutput]]:
+        """
+        Collect information about the blocks involved in the execution before the vllm `ModelRunner` executes.
+        This information will be used by the tracker after execution to register the locally computed blocks.
+        """
+        virtual_engine = execute_model_req.virtual_engine
+        block_tracker_ref = await self._get_block_tracker_ref()
+        scheduler = self.scheduler[virtual_engine]  # type: ignore
+        rank = self.get_rank()
+        executed_blocks_details: Set[Tuple[int, int]] = set()
+        for meta in execute_model_req.seq_group_metadata_list:
+            block_tables = meta.block_tables
+            for seq_id, block_ids in block_tables.items():
+                for _id in block_ids:
+                    b = scheduler.block_manager.get_block_by_block_id(seq_id, _id)
+                    # The `executed` attribute is used to prevent duplicate registration of the block.
+                    executed = scheduler.block_manager.get_block_status_by_block_id(
+                        "executed", _id
+                    )
+                    detail = (b.content_hash, b.block_id)
+                    if (b.content_hash is not None) and (not executed):
+                        executed_blocks_details.add(detail)
+
+        res = await super().execute_model_async(execute_model_req)
+
+        if executed_blocks_details:
+            """
+            Why not collect and register the information after execution?
+            Because after execution, the model's execution callback hook will release the block_id,
+            causing the block manager to lose access to the correct information.
+            """
+            await block_tracker_ref.register_blocks(
+                virtual_engine, list(executed_blocks_details), rank
+            )
+
+            for _, _id in executed_blocks_details:
+                scheduler.block_manager.set_block_status_by_block_id(
+                    "executed", _id, True
+                )
+
+        return res
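
For orientation: everything this executor needs at runtime is read from `self.vllm_config.xavier_config`. The keys visible in the hunk above are `rank`, `rank_address`, `block_tracker_address`, `block_tracker_uid` and `transfer_block_num`. A minimal sketch of such a dict, with placeholder values rather than real defaults (the actual construction happens elsewhere in the release):

# Hypothetical sketch: key names are taken from the .get() calls above, values are placeholders.
xavier_config = {
    "rank": 0,                                   # replica rank of this vLLM engine
    "rank_address": "127.0.0.1:31001",           # xoscar address hosting this rank's TransferActor
    "block_tracker_address": "127.0.0.1:31000",  # xoscar address of the shared block tracker actor
    "block_tracker_uid": "block-tracker",        # uid used to resolve the tracker actor_ref
    "transfer_block_num": 512,                   # leading dimension of the CPU transfer buffer (see buffer_shape above)
}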
@@ -0,0 +1,438 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import logging
+import time
+from collections import deque
+from typing import Callable, Deque, Dict, List, Optional, Set, Tuple, no_type_check
+
+import xoscar as xo
+from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.core.block.interfaces import Block
+from vllm.core.interfaces import BlockSpaceManager
+from vllm.core.scheduler import Scheduler, SchedulerOutputs
+from vllm.sequence import (
+    SequenceData,
+    SequenceGroup,
+    SequenceGroupMetadata,
+    SequenceGroupMetadataDelta,
+    SequenceStage,
+    SequenceStatus,
+)
+
+from .block_manager import XavierBlockManager
+
+logger = logging.getLogger(__name__)
+
+
+class XavierScheduler(Scheduler):
+    @staticmethod
+    def _get_block_space_manager_class(version: str):
+        logger.debug("Init xavier block manager.")
+        return XavierBlockManager
+
+    def __init__(
+        self,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+        lora_config: Optional[LoRAConfig],
+        pipeline_parallel_size: int = 1,
+        output_proc_callback: Optional[Callable] = None,
+        xavier_config: Optional[Dict] = None,
+        virtual_engine: Optional[int] = 0,
+    ) -> None:
+        BlockSpaceManager.get_block_space_manager_class = (
+            self._get_block_space_manager_class
+        )
+        super().__init__(
+            scheduler_config,
+            cache_config,
+            lora_config,
+            pipeline_parallel_size,
+            output_proc_callback,
+        )
+        xavier_config["virtual_engine"] = virtual_engine  # type: ignore
+        self.block_manager.xavier_config = xavier_config
+        self._xavier_config = xavier_config
+        self._virtual_engine = virtual_engine
+        self._block_tracker_ref = None
+        self._transfer_ref = None
+        self._transferring: Deque[SequenceGroup] = deque()
+        self._transfer_status: Dict[SequenceGroup, Set[int]] = {}
+
+    async def _get_block_tracker_ref(self):
+        if self._block_tracker_ref is None:
+            block_tracker_address = self._xavier_config.get("block_tracker_address")
+            block_tracker_uid = self._xavier_config.get("block_tracker_uid")
+            self._block_tracker_ref = await xo.actor_ref(
+                address=block_tracker_address, uid=block_tracker_uid
+            )
+        return self._block_tracker_ref
+
+    async def _get_transfer_ref(self):
+        from .transfer import TransferActor
+
+        if self._transfer_ref is None:
+            transfer_address = self._xavier_config.get("rank_address")
+            rank = self._xavier_config.get("rank")
+            self._transfer_ref = await xo.actor_ref(
+                address=transfer_address, uid=f"{TransferActor.default_uid()}-{rank}"
+            )
+        return self._transfer_ref
+
+    async def _get_transfer_details(
+        self,
+        virtual_engine: int,
+        block_tables: Dict[int, List[int]],
+        seq_group: SequenceGroup,
+    ) -> Tuple[Set[int], Dict[int, Set[Tuple[int, int, int]]]]:
+        # If the `seq_group` has the `force_calculation` attribute set to `True`,
+        # it indicates that there were issues during the transmission process.
+        # In this case, force the computation and exclude it from the Xavier process.
+        if getattr(seq_group, "force_calculation", False):
+            return set(), dict()
+        """
+        Retrieve information from other replicas to check if any blocks have already been computed,
+        for the purpose of data transfer.
+        """
+        details: Set[Tuple[int, int]] = set()
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            block_ids = block_tables[seq.seq_id]
+            for _id in block_ids:
+                block: Block = self.block_manager.get_block_by_block_id(seq.seq_id, _id)
+                detail = (block.content_hash, _id)
+                """
+                1. `block.content_hash is not None` means that the block has been filled with tokens.
+                Unless it is evicted from the cache, the computation result of this block is constant.
+                2. Check the `transferred` status of the block.
+                If it is `True`, it means the block has already been transferred locally
+                and does not need to be transferred again.
+                3. Check the `executed` status of the block.
+                If it is `True`, it means the block has already been computed locally
+                and does not need to be transferred.
+                """
+                if (
+                    (block.content_hash is not None)
+                    and (
+                        not self.block_manager.get_block_status_by_block_id(
+                            "transferred", block.block_id
+                        )
+                    )
+                    and (
+                        not self.block_manager.get_block_status_by_block_id(
+                            "executed", block.block_id
+                        )
+                    )
+                ):
+                    details.add(detail)
+
+        if details:
+            tracker_ref = await self._get_block_tracker_ref()
+            remote = await tracker_ref.query_blocks(virtual_engine, list(details))
+            # Not all queried blocks have corresponding results in other replicas.
+            # Therefore, it is necessary to record which local block data was actually transferred.
+            local: Set[int] = set()
+            for _, remote_details in remote.items():
+                for _, _, local_block_id in remote_details:
+                    local.add(local_block_id)
+            if local:
+                logger.debug(
+                    f"Data in local blocks: {local} will be transmitted from the remote."
+                )
+            return local, remote
+        else:
+            return set(), dict()
+
+    async def _do_transfer_inner(
+        self, virtual_engine: int, remote: Dict[int, Set[Tuple[int, int, int]]]
+    ):
+        transfer_ref = await self._get_transfer_ref()
+        for from_rank, hash_and_block_id in remote.items():
+            src_to_dst: Dict[int, int] = {x[1]: x[2] for x in hash_and_block_id}
+            await transfer_ref.recv(virtual_engine, from_rank, src_to_dst)
+
+    async def _do_transfer(
+        self,
+        virtual_engine: int,
+        local: Set[int],
+        remote: Dict[int, Set[Tuple[int, int, int]]],
+        seq_group: SequenceGroup,
+    ):
+        try:
+            await self._do_transfer_inner(virtual_engine, remote)
+        except Exception as e:
+            """
+            The exception here is most likely due to the sender triggering recovery during the transmission process.
+            In this case, fallback to performing computation during the prefill stage.
+            """
+            logger.error(f"Transfer failed: {e}")
+            # Force this `seq_group` to perform computation.
+            seq_group.force_calculation = True
+            self._transfer_status.pop(seq_group, None)
+            self.waiting.appendleft(seq_group)
+            self._transferring.remove(seq_group)
+        else:
+            # After the transfer is completed, update the corresponding metadata.
+            self._transfer_status[seq_group] = local
+            for _id in local:
+                self.block_manager.set_block_status_by_block_id(
+                    "transferred", _id, True
+                )
+            # After the transfer, place the `seq_group` back into the `waiting` queue to
+            # wait for the next scheduling execution.
+            self.waiting.appendleft(seq_group)
+            self._transferring.remove(seq_group)
+
+    @no_type_check
+    async def schedule(
+        self,
+    ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
+        virtual_engine = self._virtual_engine
+
+        # Schedule sequence groups.
+        # This function call changes the internal states of the scheduler
+        # such as self.running, self.swapped, and self.waiting.
+        scheduler_start_time = time.perf_counter()
+
+        scheduler_outputs: SchedulerOutputs = self._schedule()
+        now = time.time()
+
+        if not self.cache_config.enable_prefix_caching:
+            common_computed_block_nums = []
+
+        allow_async_output_proc: bool = self.use_async_output_proc
+
+        """Xinference Change!!!
+        Additional data structures required by Xavier.
+        """
+        scheduled_seq_groups = []
+        has_transferring = False
+
+        # Create input data structures.
+        seq_group_metadata_list: List[SequenceGroupMetadata] = []
+        for i, scheduled_seq_group in enumerate(scheduler_outputs.scheduled_seq_groups):
+            seq_group = scheduled_seq_group.seq_group
+            token_chunk_size = scheduled_seq_group.token_chunk_size
+            seq_group.maybe_set_first_scheduled_time(now)
+
+            seq_group_metadata = self._seq_group_metadata_cache[
+                self.cache_id
+            ].get_object()
+            seq_group_metadata.seq_data.clear()
+            seq_group_metadata.block_tables.clear()
+
+            # seq_id -> SequenceData
+            seq_data: Dict[int, SequenceData] = {}
+            # seq_id -> physical block numbers
+            block_tables: Dict[int, List[int]] = {}
+
+            if seq_group.is_encoder_decoder():
+                # Encoder associated with SequenceGroup
+                encoder_seq = seq_group.get_encoder_seq()
+                assert encoder_seq is not None
+                encoder_seq_data = encoder_seq.data
+                # Block table for cross-attention
+                # Also managed at SequenceGroup level
+                cross_block_table = self.block_manager.get_cross_block_table(seq_group)
+            else:
+                encoder_seq_data = None
+                cross_block_table = None
+
+            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                seq_id = seq.seq_id
+                seq_data[seq_id] = seq.data
+                block_tables[seq_id] = self.block_manager.get_block_table(seq)
+                self.block_manager.access_all_blocks_in_seq(seq, now)
+
+            """Xinference Change!!!
+            After completing the scheduling, the blocks have been allocated.
+            Therefore, it is possible to check whether some blocks have already been computed on other replicas based on this information,
+            and subsequently initiate the transfer.
+            According to the internal code comments in vllm,
+            whether `token_chunk_size` is 1 can indicate whether the `seq_group` is in the decode or prefill stage.
+            It is noted that data transmission is only applied during the prefill stage.
+            In the decode stage, it only applies to the last token of the block, which can negatively impact throughput.
+            """
+            is_prefill: bool = token_chunk_size != 1
+            if is_prefill:
+                local, remote = await self._get_transfer_details(
+                    virtual_engine, block_tables, seq_group
+                )
+                if remote:
+                    running_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
+                    for seq in running_seqs:
+                        seq.status = SequenceStatus.WAITING
+                        # Additional attribute `transferred` to mark that this `seq_group` involves a transfer process.
+                        # During the next scheduling, block allocation will no longer be required
+                        # since it has already been completed.
+                        seq.transferred = True
+                        seq.data._stage = SequenceStage.PREFILL
+                    self._transfer_status[seq_group] = set()
+                    # Use `create_task` to avoid blocking subsequent scheduling.
+                    asyncio.create_task(
+                        self._do_transfer(virtual_engine, local, remote, seq_group)
+                    )
+                    # The `seq_group` that is currently being transferred enters a new queue.
+                    self._transferring.append(seq_group)
+                    has_transferring = True
+                    continue
+                else:
+                    scheduled_seq_groups.append(seq_group)
+
+            if self.cache_config.enable_prefix_caching:
+                common_computed_block_nums = (
+                    self.block_manager.get_common_computed_block_ids(
+                        seq_group.get_seqs(status=SequenceStatus.RUNNING)
+                    )
+                )
+                """Xinference Change!!!
+                This is very important and is the core of Xavier.
+                `computed_block_nums` is the key attribute that determines which blocks do not need to be computed,
+                as decided by the `model_runner`.
+                Therefore, after the transfer is completed, this attribute needs to be updated.
+                """
+                if seq_group in self._transfer_status:
+                    transferred_blocks = self._transfer_status[seq_group]
+                    if transferred_blocks:
+                        common_computed_block_nums.extend(transferred_blocks)
+                        common_computed_block_nums = list(
+                            sorted(common_computed_block_nums)
+                        )
+                        del self._transfer_status[seq_group]
+
+            do_sample = True
+            is_prompt = seq_group.is_prefill()
+            # We should send the metadata to workers when the first prefill
+            # is sent. Subsequent requests could be chunked prefill or decode.
+            is_first_prefill = False
+            if is_prompt:
+                seqs = seq_group.get_seqs()
+                # Prefill has only 1 sequence.
+                assert len(seqs) == 1
+                num_computed_tokens = seqs[0].data.get_num_computed_tokens()
+                is_first_prefill = num_computed_tokens == 0
+                # In the next iteration, all prompt tokens are not computed.
+                # It means the prefill is chunked, and we don't need sampling.
+                # NOTE: We use get_len instead of get_prompt_len because when
+                # a sequence is preempted, prefill includes previous generated
+                # output tokens.
+                if token_chunk_size + num_computed_tokens < seqs[0].data.get_len():
+                    do_sample = False
+
+            # It assumes the scheduled_seq_groups is ordered by
+            # prefill < decoding.
+            if is_first_prefill or not self.scheduler_config.send_delta_data:
+                seq_group_metadata = SequenceGroupMetadata(
+                    request_id=seq_group.request_id,
+                    is_prompt=is_prompt,
+                    seq_data=seq_data,
+                    sampling_params=seq_group.sampling_params,
+                    block_tables=block_tables,
+                    do_sample=do_sample,
+                    pooling_params=seq_group.pooling_params,
+                    token_chunk_size=token_chunk_size,
+                    lora_request=seq_group.lora_request,
+                    computed_block_nums=common_computed_block_nums,
+                    encoder_seq_data=encoder_seq_data,
+                    cross_block_table=cross_block_table,
+                    state=seq_group.state,
+                    token_type_ids=seq_group.token_type_ids,
+                    # `multi_modal_data` will only be present for the 1st comm
+                    # between engine and worker.
+                    # the subsequent comms can still use delta, but
+                    # `multi_modal_data` will be None.
+                    multi_modal_data=seq_group.multi_modal_data
+                    if scheduler_outputs.num_prefill_groups > 0
+                    else None,
+                    multi_modal_placeholders=seq_group.multi_modal_placeholders
+                    if scheduler_outputs.num_prefill_groups > 0
+                    else None,
+                    mm_processor_kwargs=seq_group.mm_processor_kwargs,
+                    prompt_adapter_request=seq_group.prompt_adapter_request,
+                )
+            else:
+                # When SPMD mode is enabled, we only send delta data except for
+                # the first request to reduce serialization cost.
+                seq_data_delta = {}
+                for id, data in seq_data.items():
+                    seq_data_delta[id] = data.get_delta_and_reset()
+                seq_group_metadata = SequenceGroupMetadataDelta(
+                    seq_data_delta,
+                    seq_group.request_id,
+                    block_tables,
+                    is_prompt,
+                    do_sample=do_sample,
+                    token_chunk_size=token_chunk_size,
+                    computed_block_nums=common_computed_block_nums,
+                )
+            seq_group_metadata_list.append(seq_group_metadata)
+
+            if allow_async_output_proc:
+                allow_async_output_proc = self._allow_async_output_proc(seq_group)
+
+        """Xinference Change!!!
+        If the `seq_group` in this scheduling triggers a transfer,
+        it needs to be removed from the running queue (as it is already in the transferring queue).
+        It should remain in the transferring queue until the transfer is complete,
+        and then it can be placed back into the appropriate queue for scheduling.
+        """
+        if has_transferring:
+            scheduler_outputs.scheduled_seq_groups = scheduled_seq_groups
+            for seq_group in self.running.copy():
+                if seq_group in self._transfer_status:
+                    self.running.remove(seq_group)
+
+        # Now that the batch has been created, we can assume all blocks in the
+        # batch will have been computed before the next scheduling invocation.
+        # This is because the engine assumes that a failure in model execution
+        # will crash the vLLM instance / will not retry.
+        for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
+            self.block_manager.mark_blocks_as_computed(
+                scheduled_seq_group.seq_group, scheduled_seq_group.token_chunk_size
+            )
+
+        self._seq_group_metadata_cache[self.next_cache_id].reset()
+
+        scheduler_time = time.perf_counter() - scheduler_start_time
+        # Add this to scheduler time to all the sequences that are currently
+        # running. This will help estimate if the scheduler is a significant
+        # component in the e2e latency.
+        for seq_group in self.running:
+            if seq_group is not None and seq_group.metrics is not None:
+                if seq_group.metrics.scheduler_time is not None:
+                    seq_group.metrics.scheduler_time += scheduler_time
+                else:
+                    seq_group.metrics.scheduler_time = scheduler_time
+
+        # Move to next cache (if exists)
+        self.cache_id = self.next_cache_id
+
+        # Return results
+        return (seq_group_metadata_list, scheduler_outputs, allow_async_output_proc)
+
+    def has_unfinished_seqs(self) -> bool:
+        """
+        This interface is used to determine whether the scheduling process should stop,
+        so it needs to include information about the transferring queue.
+        """
+        res = super().has_unfinished_seqs()
+        return res or len(self._transferring) != 0
+
+    def get_num_unfinished_seq_groups(self) -> int:
+        """
+        When retrieving information from this interface,
+        the information from the transferring queue needs to be taken into account.
+        """
+        res = super().get_num_unfinished_seq_groups()
+        return res + len(self._transferring)
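
One detail worth spelling out, since it is only implicit in the hunk above: reading `_get_transfer_details` and `_do_transfer_inner` together, each entry returned by the block tracker's `query_blocks` appears to be a `(content_hash, remote_block_id, local_block_id)` tuple grouped by the sending rank. That layout is inferred from the two loops above, not from a documented contract. A small worked example under that assumption:

# Hypothetical values for illustration; the tuple layout is an inference from the code above.
remote = {
    1: {                  # rank 1 already holds two blocks this replica needs
        (12345, 17, 5),   # (content_hash, block id on rank 1, local destination block id)
        (67890, 42, 6),
    },
}
local = {x[2] for details in remote.values() for x in details}  # {5, 6}, later marked "transferred"
src_to_dst = {x[1]: x[2] for x in remote[1]}                    # {17: 5, 42: 6}, passed to transfer_ref.recv()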
@@ -0,0 +1,13 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.