xinference 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.

Files changed (210)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +72 -66
  4. xinference/core/model.py +78 -25
  5. xinference/core/supervisor.py +81 -10
  6. xinference/core/utils.py +12 -8
  7. xinference/core/worker.py +32 -0
  8. xinference/model/audio/core.py +5 -0
  9. xinference/model/audio/cosyvoice.py +25 -3
  10. xinference/model/audio/f5tts.py +15 -10
  11. xinference/model/audio/f5tts_mlx.py +260 -0
  12. xinference/model/audio/fish_speech.py +35 -111
  13. xinference/model/audio/model_spec.json +19 -3
  14. xinference/model/audio/model_spec_modelscope.json +9 -0
  15. xinference/model/audio/utils.py +32 -0
  16. xinference/model/image/core.py +69 -1
  17. xinference/model/image/model_spec.json +145 -4
  18. xinference/model/image/model_spec_modelscope.json +150 -4
  19. xinference/model/image/stable_diffusion/core.py +45 -13
  20. xinference/model/llm/__init__.py +2 -0
  21. xinference/model/llm/llm_family.json +143 -0
  22. xinference/model/llm/llm_family.py +15 -36
  23. xinference/model/llm/llm_family_modelscope.json +148 -0
  24. xinference/model/llm/mlx/core.py +37 -32
  25. xinference/model/llm/transformers/cogagent.py +272 -0
  26. xinference/model/llm/transformers/core.py +2 -0
  27. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  28. xinference/model/llm/utils.py +28 -3
  29. xinference/model/llm/vllm/core.py +48 -9
  30. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  31. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  32. xinference/model/llm/vllm/xavier/block.py +112 -0
  33. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  34. xinference/model/llm/vllm/xavier/block_tracker.py +116 -0
  35. xinference/model/llm/vllm/xavier/engine.py +247 -0
  36. xinference/model/llm/vllm/xavier/executor.py +132 -0
  37. xinference/model/llm/vllm/xavier/scheduler.py +422 -0
  38. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  39. xinference/model/llm/vllm/xavier/test/test_xavier.py +122 -0
  40. xinference/model/llm/vllm/xavier/transfer.py +298 -0
  41. xinference/model/video/diffusers.py +14 -0
  42. xinference/model/video/model_spec.json +15 -0
  43. xinference/model/video/model_spec_modelscope.json +16 -0
  44. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  45. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  46. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  47. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  48. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  49. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  50. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  51. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  52. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  53. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  54. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  55. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  56. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  57. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  58. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  59. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  60. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  61. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  62. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  63. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  64. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  65. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  66. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  67. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  68. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  69. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  70. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  71. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  72. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  73. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  74. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  75. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  76. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  77. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  78. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  79. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  80. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  81. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  82. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  83. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  84. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  85. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  86. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  87. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  88. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  89. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  90. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  91. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  92. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  93. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  94. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  95. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  96. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  97. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  98. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  99. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  100. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  101. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  102. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  103. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  104. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  105. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  106. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  107. xinference/thirdparty/matcha/utils/utils.py +2 -2
  108. xinference/types.py +13 -0
  109. xinference/web/ui/build/asset-manifest.json +6 -6
  110. xinference/web/ui/build/index.html +1 -1
  111. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  112. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  113. xinference/web/ui/build/static/js/main.1eb206d1.js +3 -0
  114. xinference/web/ui/build/static/js/main.1eb206d1.js.map +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  116. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  117. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  118. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  119. xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +1 -0
  120. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  121. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  122. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  123. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  124. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  125. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  126. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  127. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  129. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  130. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  131. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  132. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  133. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  134. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  135. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  136. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  137. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  138. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  139. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  140. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  141. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  142. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  143. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  144. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  145. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  146. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  147. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  148. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  149. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  150. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  151. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  152. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  153. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  154. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  155. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  156. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  157. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  158. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  159. xinference/web/ui/node_modules/.package-lock.json +67 -3
  160. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  161. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  162. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  163. xinference/web/ui/node_modules/i18next/package.json +129 -0
  164. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  165. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  166. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  167. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  168. xinference/web/ui/package-lock.json +69 -3
  169. xinference/web/ui/package.json +2 -0
  170. xinference/web/ui/src/locales/en.json +186 -0
  171. xinference/web/ui/src/locales/zh.json +186 -0
  172. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/METADATA +19 -11
  173. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/RECORD +178 -111
  174. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  175. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  176. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  177. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  178. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  179. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  180. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  181. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  182. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  183. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  184. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.4eb4ee80.js +0 -3
  186. xinference/web/ui/build/static/js/main.4eb4ee80.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  191. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  192. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  193. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  194. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  195. xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +0 -1
  196. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  197. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  198. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  199. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  200. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  201. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  202. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  203. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  204. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  205. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  206. /xinference/web/ui/build/static/js/{main.4eb4ee80.js.LICENSE.txt → main.1eb206d1.js.LICENSE.txt} +0 -0
  207. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/LICENSE +0 -0
  208. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/WHEEL +0 -0
  209. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/entry_points.txt +0 -0
  210. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/vllm/xavier/allocator.py (new file)
@@ -0,0 +1,74 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Any, Dict, Optional
+
+ from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+ from vllm.core.block.interfaces import DeviceAwareBlockAllocator
+ from vllm.platforms import current_platform
+ from vllm.utils import Device
+
+ from .block import XavierPrefixCachingBlockAllocator
+
+
+ class XavierCpuGpuBlockAllocator(CpuGpuBlockAllocator):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self._xavier_config: Optional[Dict[str, Any]] = None
+
+     @property
+     def xavier_config(self):
+         return self._xavier_config
+
+     @xavier_config.setter
+     def xavier_config(self, v: Dict[str, Any]):
+         self._xavier_config = v
+         self._allocators[Device.GPU].xavier_config = v
+
+     @staticmethod
+     def create(
+         allocator_type: str,
+         num_gpu_blocks: int,
+         num_cpu_blocks: int,
+         block_size: int,
+     ) -> DeviceAwareBlockAllocator:
+         """Xinference Change!!!
+         1. The code is copied here because the `allocator` needs to be instantiated as a subclass.
+         2. Why not re-instantiate it externally?
+         Re-instantiating the `allocator` is costly because it requires initializing many tensors.
+         """
+
+         # For HPU, block id 0 is used only for padding
+         reserved_blocks = 1 if current_platform.is_hpu() else 0
+         block_ids = list(range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
+         num_gpu_blocks -= reserved_blocks
+         gpu_block_ids = block_ids[:num_gpu_blocks]
+         cpu_block_ids = block_ids[num_gpu_blocks:]
+
+         gpu_allocator = XavierPrefixCachingBlockAllocator(
+             run_isolation=True,
+             num_blocks=num_gpu_blocks,
+             block_size=block_size,
+             block_ids=gpu_block_ids,
+         )
+
+         cpu_allocator = XavierPrefixCachingBlockAllocator(
+             num_blocks=num_cpu_blocks,
+             block_size=block_size,
+             block_ids=cpu_block_ids,
+         )
+
+         return XavierCpuGpuBlockAllocator(
+             cpu_block_allocator=cpu_allocator,
+             gpu_block_allocator=gpu_allocator,
+         )
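
The `create()` override above mirrors vllm's own `CpuGpuBlockAllocator.create()` so that both device allocators come back as `XavierPrefixCachingBlockAllocator` instances, and the `xavier_config` setter forwards the config to the GPU-side allocator. A minimal usage sketch follows, assuming a vllm build compatible with the copied code; the block counts, addresses, and config values are illustrative only (the keys match those read in block.py below):

    # Hypothetical sketch, not part of the diff.
    allocator = XavierCpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",
        num_gpu_blocks=16,
        num_cpu_blocks=4,
        block_size=16,
    )
    # The setter forwards the config to the GPU-side XavierPrefixCachingBlockAllocator.
    allocator.xavier_config = {
        "block_tracker_address": "127.0.0.1:9999",   # illustrative actor address
        "virtual_engine": 0,
        "rank_address": "127.0.0.1:10001",
    }
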
xinference/model/llm/vllm/xavier/block.py (new file)
@@ -0,0 +1,112 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import asyncio
+ import logging
+ from typing import Any, Dict, Optional
+
+ import xoscar as xo
+ from vllm.core.block.interfaces import BlockId
+ from vllm.core.block.prefix_caching_block import (
+     BlockTracker,
+     PrefixCachingBlockAllocator,
+ )
+
+ from .....isolation import Isolation
+
+ logger = logging.getLogger(__name__)
+
+
+ class XavierInnerBlockTracker(BlockTracker):
+     """Used to track the status of a block inside the prefix caching allocator"""
+
+     """
+     Here, two fixed attributes, `transferred` and `executed`,
+     have been added to the `BlockTracker` class to mark the status of the corresponding `block_id`.
+     We cannot directly set attributes on the `Block` object
+     because the `Block` objects are dynamically allocated with each scheduling.
+     The `Block` objects executed in two different scheduling steps may have the same `id`, `hash`, etc.,
+     but the instance objects may differ.
+     The BlockTracker object inside vllm is one-to-one with the block_id.
+     """
+     __slots__ = ("active", "last_accessed", "computed", "transferred", "executed")
+
+     def __init__(self):
+         super().__init__()
+         self.transferred = False
+         self.executed = False
+
+
+ class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
+     def __init__(self, *args, run_isolation: bool = False, **kwargs):
+         super().__init__(*args, **kwargs)
+         for _id in self._block_tracker.keys():
+             self._block_tracker[_id] = XavierInnerBlockTracker()
+
+         self._xavier_config: Optional[Dict[str, Any]] = None
+         self._block_tracker_ref = None
+         if run_isolation:
+             self._isolation = Isolation(
+                 asyncio.new_event_loop(), threaded=True, daemon=True
+             )
+             self._isolation.start()
+         else:
+             self._isolation = None  # type: ignore
+
+     def __del__(self):
+         if self._isolation is not None:
+             self._isolation.stop()
+
+     @property
+     def xavier_config(self):
+         return self._xavier_config
+
+     @xavier_config.setter
+     def xavier_config(self, v: Dict[str, Any]):
+         self._xavier_config = v
+
+     async def _get_block_tracker_ref(self):
+         from .block_tracker import VLLMBlockTracker
+
+         if self._block_tracker_ref is None:
+             block_tracker_address = self.xavier_config.get("block_tracker_address")
+             self._block_tracker_ref = await xo.actor_ref(
+                 address=block_tracker_address, uid=VLLMBlockTracker.default_uid()
+             )
+         return self._block_tracker_ref
+
+     async def unregister_block(self, block_id: int):
+         assert self._xavier_config is not None
+         tracker_ref = await self._get_block_tracker_ref()
+         await tracker_ref.unregister_block(
+             self.xavier_config.get("virtual_engine"),
+             self.xavier_config.get("rank_address"),
+             block_id,
+         )
+
+     def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
+         """
+         This is the only entry point where the `block_id` is evicted from the cache.
+         Therefore, when the `block_id` is evicted, the tracker actor needs to unregister the block information.
+         At the same time, make sure to reset the attributes corresponding to that `block_id`.
+         """
+         evicted_block_id = super()._maybe_allocate_evicted_block_id()
+         logger.debug(f"block_id: {evicted_block_id} will be evicted from the cache.")
+         if evicted_block_id is not None and self._isolation is not None:
+             tracker = self._block_tracker[evicted_block_id]
+             assert isinstance(tracker, XavierInnerBlockTracker)
+             tracker.transferred = False
+             tracker.executed = False
+             self._isolation.call(self.unregister_block(evicted_block_id))
+             logger.debug(f"block_id: {evicted_block_id} will be used again.")
+         return evicted_block_id
xinference/model/llm/vllm/xavier/block_manager.py (new file)
@@ -0,0 +1,71 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import logging
+ from typing import Any, Dict, Optional
+
+ from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+ from vllm.core.block.interfaces import Block
+ from vllm.core.block_manager import SelfAttnBlockSpaceManager
+ from vllm.sequence import SequenceGroup, SequenceStatus
+ from vllm.utils import Device
+
+ from .allocator import XavierCpuGpuBlockAllocator
+
+ logger = logging.getLogger(__name__)
+
+
+ class XavierBlockManager(SelfAttnBlockSpaceManager):
+     def __init__(self, *args, **kwargs):
+         # Monkey patch
+         CpuGpuBlockAllocator.create = XavierCpuGpuBlockAllocator.create
+         super().__init__(*args, **kwargs)
+         self._xavier_config: Optional[Dict[str, Any]] = None
+         logger.debug("Init xavier block manager done.")
+
+     @property
+     def xavier_config(self):
+         return self._xavier_config
+
+     @xavier_config.setter
+     def xavier_config(self, value: Dict[str, Any]):
+         self._xavier_config = value
+         self.block_allocator.xavier_config = value
+
+     def get_block_by_block_id(self, seq_id: int, block_id: int) -> Block:
+         table = self.block_tables[seq_id]
+         for b in table.blocks:
+             if b.block_id == block_id:
+                 return b
+
+     def get_block_status_by_block_id(self, status_name: str, block_id: int) -> bool:
+         tracker = self.block_allocator._allocators[Device.GPU]._block_tracker[block_id]
+         return getattr(tracker, status_name)
+
+     def set_block_status_by_block_id(
+         self, status_name: str, block_id: int, status: bool
+     ) -> None:
+         tracker = self.block_allocator._allocators[Device.GPU]._block_tracker[block_id]
+         assert getattr(tracker, status_name, None) is not None
+         setattr(tracker, status_name, status)
+
+     def allocate(self, seq_group: SequenceGroup) -> None:
+         """
+         If the `seq_group` has the `transferred` attribute,
+         it indicates that the `seq_group` has gone through the transfer process,
+         so the block allocation logic should not be executed again.
+         """
+         waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
+         if all([getattr(s, "transferred", False) for s in waiting_seqs]):
+             return
+         super().allocate(seq_group)
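
The `get_block_status_by_block_id` / `set_block_status_by_block_id` helpers expose the `transferred` and `executed` flags kept by `XavierInnerBlockTracker`, and `allocate()` skips re-allocation for sequence groups that already arrived via transfer. A hypothetical calling pattern, using only the methods defined above (the `block_manager` instance and `block_id` are placeholders; the actual call sites live in the Xavier scheduler, which is not shown in this excerpt):

    # Hypothetical sketch, not part of the diff.
    if not block_manager.get_block_status_by_block_id("transferred", block_id):
        # ...fetch the KV block from a remote rank, then record the fact:
        block_manager.set_block_status_by_block_id("transferred", block_id, True)
    if block_manager.get_block_status_by_block_id("executed", block_id):
        pass  # the block was already computed locally; no transfer needed
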
xinference/model/llm/vllm/xavier/block_tracker.py (new file)
@@ -0,0 +1,116 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import random
+ from typing import Dict, List, Optional, Set, Tuple
+
+ import xoscar as xo
+
+
+ class VLLMBlockTracker(xo.StatelessActor):
+     @classmethod
+     def default_uid(cls):
+         return f"vllm-block-tracker-actor"
+
+     def __init__(self):
+         super().__init__()
+         # engine -> hash_to_address_and_block_id
+         self._hash_to_address_and_block_id: Dict[
+             int, Dict[int, Set[Tuple[str, int]]]
+         ] = {}
+         # engine -> address_to_hash_and_block_id
+         self._address_to_hash_and_block_id: Dict[
+             int, Dict[str, Set[Tuple[int, int]]]
+         ] = {}
+
+     def register_blocks(
+         self, virtual_engine: int, block_infos: List[Tuple[int, int]], address: str
+     ):
+         # Update query meta
+         if virtual_engine not in self._hash_to_address_and_block_id:
+             self._hash_to_address_and_block_id[virtual_engine] = {}
+         hash_to_address_and_block_id = self._hash_to_address_and_block_id[
+             virtual_engine
+         ]
+         for hash_content, block_id in block_infos:
+             if hash_content not in hash_to_address_and_block_id:
+                 hash_to_address_and_block_id[hash_content] = {
+                     (address, block_id),
+                 }
+             else:
+                 hash_to_address_and_block_id[hash_content].add((address, block_id))
+
+         # Update remove meta
+         if virtual_engine not in self._address_to_hash_and_block_id:
+             self._address_to_hash_and_block_id[virtual_engine] = {}
+         address_to_hash_and_block_id = self._address_to_hash_and_block_id[
+             virtual_engine
+         ]
+         if address not in address_to_hash_and_block_id:
+             address_to_hash_and_block_id[address] = set()
+         address_to_hash_and_block_id[address].update(block_infos)
+
+     def query_blocks(
+         self, virtual_engine: int, hash_contents: List[Tuple[int, int]]
+     ) -> Dict[str, Set[Tuple[int, int, int]]]:
+         if virtual_engine not in self._hash_to_address_and_block_id:
+             return {}
+         hash_to_address_and_block_id = self._hash_to_address_and_block_id[
+             virtual_engine
+         ]
+         remote: Dict[str, Set[Tuple[int, int, int]]] = {}
+         for hash_content, _id in hash_contents:
+             if (
+                 hash_content in hash_to_address_and_block_id
+             ) and hash_to_address_and_block_id[hash_content]:
+                 # TODO: Randomly select here, and try to distribute requests as evenly as possible.
+                 # There may be better methods in the future.
+                 address, block_id = random.choice(
+                     list(hash_to_address_and_block_id[hash_content])
+                 )
+                 if address not in remote:
+                     remote[address] = {
+                         (hash_content, block_id, _id),
+                     }
+                 else:
+                     remote[address].add((hash_content, block_id, _id))
+         return remote
+
+     def unregister_block(self, virtual_engine: int, address: str, block_id: int):
+         if (virtual_engine not in self._address_to_hash_and_block_id) or (
+             virtual_engine not in self._hash_to_address_and_block_id
+         ):
+             return
+
+         # Update remove meta
+         address_to_hash_and_block_id = self._address_to_hash_and_block_id[
+             virtual_engine
+         ]
+         if address not in address_to_hash_and_block_id:
+             return
+         hash_and_block_id = address_to_hash_and_block_id[address]
+         detail: Optional[Tuple[int, int]] = None
+         for hash_content, _id in hash_and_block_id.copy():
+             if _id == block_id:
+                 detail = (hash_content, block_id)
+                 hash_and_block_id.discard(detail)
+                 break
+
+         # Update query meta
+         if detail is not None:
+             hash_to_address_and_block_id = self._hash_to_address_and_block_id[
+                 virtual_engine
+             ]
+             _hash = detail[0]
+             if _hash in hash_to_address_and_block_id:
+                 hash_to_address_and_block_id[_hash].discard((address, detail[1]))
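
Both mappings are keyed by virtual engine: one answers "which (address, block_id) pairs hold a block with this prefix hash", the other remembers what each address registered so eviction can clean it up. A sketch of the round trip between a rank and this actor, built only from calls already used in block.py above (addresses, hashes, and block ids are illustrative):

    # Hypothetical sketch, not part of the diff.
    tracker_ref = await xo.actor_ref(
        address="127.0.0.1:9999",  # the block_tracker_address from xavier_config
        uid=VLLMBlockTracker.default_uid(),
    )
    # Rank A announces the prefix-cache blocks it holds as (hash, block_id) pairs.
    await tracker_ref.register_blocks(0, [(12345, 7), (67890, 8)], "rank-a-address")
    # Rank B asks where the hashes it is missing can be fetched from.
    remote = await tracker_ref.query_blocks(0, [(12345, 3)])
    # -> {"rank-a-address": {(12345, 7, 3)}}: copy remote block 7 into local block 3.
    # When rank A evicts block 7, it unregisters it so peers stop querying it.
    await tracker_ref.unregister_block(0, "rank-a-address", 7)
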
xinference/model/llm/vllm/xavier/engine.py (new file)
@@ -0,0 +1,247 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import logging
+ from typing import Dict, List, Optional, Type, Union
+
+ from vllm import AsyncEngineArgs, EmbeddingRequestOutput, RequestOutput
+ from vllm.config import VllmConfig
+ from vllm.engine.async_llm_engine import AsyncLLMEngine, _AsyncLLMEngine
+ from vllm.engine.llm_engine import SchedulerOutputState
+ from vllm.engine.metrics_types import StatLoggerBase
+ from vllm.executor.executor_base import ExecutorBase
+ from vllm.sequence import ExecuteModelRequest
+ from vllm.usage.usage_lib import UsageContext
+
+ from .executor import XavierExecutor
+ from .scheduler import XavierScheduler
+
+ logger = logging.getLogger(__name__)
+
+
+ class XavierInternalEngine(_AsyncLLMEngine):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self._xavier_config = kwargs["vllm_config"].xavier_config
+         self.scheduler = [
+             XavierScheduler(
+                 self.scheduler_config,
+                 self.cache_config,
+                 self.lora_config,
+                 self.parallel_config.pipeline_parallel_size,
+                 self.async_callbacks[v_id]
+                 if self.model_config.use_async_output_proc
+                 else None,
+                 xavier_config=self._xavier_config,
+                 virtual_engine=v_id,
+             )
+             for v_id in range(self.parallel_config.pipeline_parallel_size)
+         ]
+         self.output_processor.scheduler = self.scheduler
+         self.model_executor.scheduler = self.scheduler
+
+     async def step_async(
+         self, virtual_engine: int
+     ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
+         """Performs one decoding iteration and returns newly generated results.
+         The workers are ran asynchronously if possible.
+
+         This function performs one decoding iteration of the engine. It first
+         schedules the sequences to be executed in the next iteration and the
+         token blocks to be swapped in/out/copy. Then, it executes the model
+         and updates the scheduler with the model outputs. Finally, it decodes
+         the sequences and returns the newly generated results.
+         """
+         # these are cached outputs from previous iterations. None if on first
+         # iteration
+         cached_outputs = self.cached_scheduler_outputs[virtual_engine]
+         seq_group_metadata_list = cached_outputs.seq_group_metadata_list
+         scheduler_outputs = cached_outputs.scheduler_outputs
+         allow_async_output_proc = cached_outputs.allow_async_output_proc
+
+         ctx = self.scheduler_contexts[virtual_engine]
+
+         # Clear outputs for each new scheduler iteration
+         ctx.request_outputs.clear()
+
+         # skip the scheduler if there are any remaining steps in the seq groups.
+         # This ensures that the scheduler is only called again when the current
+         # batch has completed.
+         if not self._has_remaining_steps(seq_group_metadata_list):
+             # Schedule iteration
+             """Xinference Change!!!
+             Why copy the entire function code of vllm:
+             The purpose here is to modify the way the `schedule` function is invoked to asynchronous calling.
+             No other modifications were made elsewhere.
+             """
+             (
+                 seq_group_metadata_list,
+                 scheduler_outputs,
+                 allow_async_output_proc,
+             ) = await self.scheduler[virtual_engine].schedule()
+
+             ctx.seq_group_metadata_list = seq_group_metadata_list
+             ctx.scheduler_outputs = scheduler_outputs
+
+             # Maybe switch from async mode to sync mode
+             if not allow_async_output_proc and len(ctx.output_queue) > 0:
+                 self._process_model_outputs(ctx=ctx)
+
+             if (
+                 self.scheduler_config.is_multi_step
+                 and scheduler_outputs.num_lookahead_slots > 0
+             ):
+                 # cache the scheduler outputs for the next iteration if we have
+                 # lookahead slots
+                 self._cache_scheduler_outputs_for_multi_step(
+                     virtual_engine,
+                     seq_group_metadata_list,
+                     scheduler_outputs,
+                     allow_async_output_proc,
+                 )
+
+         assert seq_group_metadata_list is not None
+         assert scheduler_outputs is not None
+
+         if not scheduler_outputs.is_empty():
+             finished_requests_ids = self.scheduler[
+                 virtual_engine
+             ].get_and_reset_finished_requests_ids()
+
+             # Check if we have a cached last_output from the previous iteration.
+             # For supporting PP this is probably the best way to pass the
+             # sampled_token_ids, as a separate broadcast over all the PP stages
+             # will cause one virtual engine's microbatch to block the pipeline.
+             last_sampled_token_ids = self._get_last_sampled_token_ids(virtual_engine)
+
+             execute_model_req = ExecuteModelRequest(
+                 seq_group_metadata_list=seq_group_metadata_list,
+                 blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
+                 blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
+                 blocks_to_copy=scheduler_outputs.blocks_to_copy,
+                 virtual_engine=virtual_engine,
+                 num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
+                 running_queue_size=scheduler_outputs.running_queue_size,
+                 finished_requests_ids=finished_requests_ids,
+                 # We use ExecuteModelRequest to pass the last sampled_token_ids
+                 # to each of the non-last PP stages for in-place prepare_input.
+                 last_sampled_token_ids=last_sampled_token_ids,
+             )
+
+             if allow_async_output_proc:
+                 execute_model_req.async_callback = self.async_callbacks[virtual_engine]
+
+             # Execute the model.
+             outputs = await self.model_executor.execute_model_async(execute_model_req)
+
+             # we need to do this here so that last step's sampled_token_ids can
+             # be passed to the next iteration for PP.
+             if self.scheduler_config.is_multi_step:
+                 self._update_cached_scheduler_output(virtual_engine, outputs)
+         else:
+             if len(ctx.output_queue) > 0:
+                 self._process_model_outputs(ctx=ctx)
+             outputs = []
+
+         # Finish the current step for all the sequence groups.
+         if self.scheduler_config.is_multi_step:
+             for seq_group in seq_group_metadata_list:
+                 seq_group.finish_step()
+
+         if not self._has_remaining_steps(seq_group_metadata_list):
+             # Clear the cache if we have finished all the steps
+             if self.scheduler_config.is_multi_step:
+                 self.cached_scheduler_outputs[virtual_engine] = SchedulerOutputState()
+
+             # is_first_step_output is True only when the num_steps of all
+             # the sequences are 1. When the num_steps > 1,
+             # multi_step_model_runner does the first-step output append.
+             is_first_step_output: bool = (
+                 False
+                 if not seq_group_metadata_list
+                 else seq_group_metadata_list[0].state.num_steps == 1
+             )
+
+             ctx.append_output(
+                 outputs=outputs,
+                 seq_group_metadata_list=seq_group_metadata_list,
+                 scheduler_outputs=scheduler_outputs,
+                 is_async=allow_async_output_proc,
+                 is_last_step=True,
+                 is_first_step_output=is_first_step_output,
+             )
+
+             if outputs and allow_async_output_proc:
+                 assert (
+                     len(outputs) == 1
+                 ), "Async postprocessor expects only a single output set"
+                 self._advance_to_next_step(
+                     outputs[0],
+                     seq_group_metadata_list,
+                     scheduler_outputs.scheduled_seq_groups,
+                 )
+
+             if not allow_async_output_proc:
+                 self._process_model_outputs(ctx=ctx)
+
+                 # Log stats.
+                 self.do_log_stats(scheduler_outputs, outputs)
+
+                 # Tracing
+                 self.do_tracing(scheduler_outputs)
+
+         else:
+             # Multi-step case
+             return ctx.request_outputs
+
+         if not self.has_unfinished_requests():
+             # Drain async postprocessor (if exists)
+             if len(ctx.output_queue) > 0:
+                 self._process_model_outputs(ctx=ctx)
+             assert len(ctx.output_queue) == 0
+
+         return ctx.request_outputs
+
+
+ class XavierEngine(AsyncLLMEngine):
+     _engine_class: Type[_AsyncLLMEngine] = XavierInternalEngine
+     _xavier_config: Optional[Dict] = None
+
+     @classmethod
+     def _get_executor_cls(cls, engine_config: VllmConfig) -> Type[ExecutorBase]:
+         logger.debug(f"Initializing Xavier executor.")
+         return XavierExecutor
+
+     @classmethod
+     def from_engine_args(
+         cls,
+         engine_args: AsyncEngineArgs,
+         engine_config: Optional[VllmConfig] = None,
+         start_engine_loop: bool = True,
+         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+         stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+         xavier_config: Optional[Dict] = None,
+     ) -> "AsyncLLMEngine":
+         cls._xavier_config = xavier_config
+         return super().from_engine_args(
+             engine_args, engine_config, start_engine_loop, usage_context, stat_loggers
+         )
+
+     def __init__(self, *args, **kwargs):
+         # set xavier_config to `vllm_config`,
+         # because it may be needed everywhere in the vllm internal components
+         kwargs["vllm_config"].xavier_config = self._xavier_config
+         super().__init__(*args, **kwargs)
+
+     async def init_xavier(self):
+         await self.engine.model_executor.init_transfer()
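
XavierEngine threads `xavier_config` through `from_engine_args` into `vllm_config`, swaps in the Xavier executor, and exposes `init_xavier()` to set up the transfer layer after construction. A rough construction sketch, assuming xinference and a compatible vllm are installed; the model path and config values are placeholders, while the keys are the ones referenced in block.py above:

    # Hypothetical sketch, not part of the diff.
    import asyncio

    from vllm import AsyncEngineArgs

    from xinference.model.llm.vllm.xavier.engine import XavierEngine

    async def main():
        engine = XavierEngine.from_engine_args(
            AsyncEngineArgs(model="/path/to/model"),
            xavier_config={
                "block_tracker_address": "127.0.0.1:9999",
                "virtual_engine": 0,
                "rank_address": "127.0.0.1:10001",
            },
        )
        await engine.init_xavier()

    asyncio.run(main())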