xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
@@ -52,6 +52,7 @@ QWEN_TOOL_CALL_FAMILY = [
52
52
  "qwen2-instruct",
53
53
  "qwen2-moe-instruct",
54
54
  "qwen2.5-instruct",
55
+ "qwen2.5-coder-instruct",
55
56
  ]
56
57
 
57
58
  GLM4_TOOL_CALL_FAMILY = [
@@ -96,13 +97,22 @@ class ChatModelMixin:
96
97
  return rendered
97
98
 
98
99
  def get_full_context(
99
- self, messages: List, chat_template: str, tokenizer=None, **kwargs
100
- ) -> str:
100
+ self,
101
+ messages: List,
102
+ chat_template: str,
103
+ tokenizer=None,
104
+ tokenize=False,
105
+ **kwargs,
106
+ ):
107
+ if "vision" not in self.model_family.model_ability: # type: ignore
108
+ messages = self.convert_messages_with_content_list_to_str_conversion(
109
+ messages
110
+ )
101
111
  if tokenizer is not None:
102
112
  try:
103
113
  full_context = tokenizer.apply_chat_template(
104
114
  messages,
105
- tokenize=False,
115
+ tokenize=tokenize,
106
116
  chat_template=chat_template,
107
117
  add_generation_prompt=True,
108
118
  **kwargs,
@@ -118,6 +128,25 @@ class ChatModelMixin:
118
128
  # Compilation function uses a cache to avoid recompiling the same template
119
129
  return self._build_from_raw_template(messages, chat_template, **kwargs)
120
130
 
131
+ @staticmethod
132
+ def convert_messages_with_content_list_to_str_conversion(
133
+ messages: List[Dict],
134
+ ) -> List[Dict]:
135
+ """
136
+ Handles messages with content list conversion, in order to support Cline, see GH#2659 .
137
+ """
138
+ for message in messages:
139
+ texts = ""
140
+ msg_content = message.get("content")
141
+ if msg_content:
142
+ if isinstance(msg_content, str):
143
+ texts = msg_content
144
+ elif isinstance(msg_content, list):
145
+ texts = "\n".join(item.get("text", "") for item in msg_content)
146
+ if texts:
147
+ message["content"] = texts
148
+ return messages
149
+
121
150
  @staticmethod
122
151
  def get_specific_prompt(model_family: str, messages: List[ChatCompletionMessage]):
123
152
  """
@@ -324,7 +353,10 @@ class ChatModelMixin:
324
353
  """
325
354
  try:
326
355
  if isinstance(c, dict):
327
- return [(None, c["name"], c["arguments"])]
356
+ try:
357
+ return [(None, c["name"], json.loads(c["arguments"]))]
358
+ except Exception:
359
+ return [(None, c["name"], c["arguments"])]
328
360
  except KeyError:
329
361
  logger.error("Can't parse glm output: %s", c)
330
362
  return [(str(c), None, None)]
@@ -569,6 +601,25 @@ def _decode_image(_url):
569
601
  return Image.open(BytesIO(response.content)).convert("RGB")
570
602
 
571
603
 
604
def _decode_image_without_rgb(_url):
    """
    Decode an image from a data URI, an HTTP(S) URL, or a local file path,
    preserving the original color mode (no ``convert("RGB")``).

    :param _url: a ``data:`` URI with base64 payload, an HTTP(S) URL, or a
        local filesystem path.
    :return: a ``PIL.Image.Image`` opened from the decoded bytes.
    """
    if _url.startswith("data:"):
        logging.info("Parse url by base64 decoder.")
        # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
        # e.g. f"data:image/jpeg;base64,{base64_image}"
        # Split on the first "," so media-type parameters containing extra
        # ";" (e.g. "data:image/png;charset=utf-8;base64,...") don't raise
        # ValueError as the old split(";") unpacking did.
        _header, data = _url.split(",", 1)
        data = base64.b64decode(data.encode("utf-8"))
        return Image.open(BytesIO(data))
    else:
        try:
            # Timeout so a dead host cannot hang the caller indefinitely.
            response = requests.get(_url, timeout=60)
        except requests.exceptions.MissingSchema:
            # Not a URL at all -- treat it as a local file path.
            return Image.open(_url)
        else:
            return Image.open(BytesIO(response.content))
621
+
622
+
572
623
  @typing.no_type_check
573
624
  def generate_completion_chunk(
574
625
  chunk_text: Optional[str],
@@ -69,6 +69,8 @@ class VLLMModelConfig(TypedDict, total=False):
69
69
  quantization: Optional[str]
70
70
  max_model_len: Optional[int]
71
71
  limit_mm_per_prompt: Optional[Dict[str, int]]
72
+ guided_decoding_backend: Optional[str]
73
+ scheduling_policy: Optional[str]
72
74
 
73
75
 
74
76
  class VLLMGenerateConfig(TypedDict, total=False):
@@ -85,6 +87,15 @@ class VLLMGenerateConfig(TypedDict, total=False):
85
87
  stop: Optional[Union[str, List[str]]]
86
88
  stream: bool # non-sampling param, should not be passed to the engine.
87
89
  stream_options: Optional[Union[dict, None]]
90
+ skip_special_tokens: Optional[bool]
91
+ response_format: Optional[dict]
92
+ guided_json: Optional[Union[str, dict]]
93
+ guided_regex: Optional[str]
94
+ guided_choice: Optional[List[str]]
95
+ guided_grammar: Optional[str]
96
+ guided_json_object: Optional[bool]
97
+ guided_decoding_backend: Optional[str]
98
+ guided_whitespace_pattern: Optional[str]
88
99
 
89
100
 
90
101
  try:
@@ -144,7 +155,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
144
155
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
145
156
  VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
146
157
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
147
-
158
+ VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
159
+ VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
160
+ VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
148
161
 
149
162
  if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
150
163
  VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -171,14 +184,19 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
171
184
  if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
172
185
  VLLM_SUPPORTED_MODELS.append("llama-3.1")
173
186
  VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
187
+ VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
174
188
 
175
189
  if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
176
190
  VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
177
191
 
192
+ if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
193
+ VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
194
+
178
195
  if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
179
196
  VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
180
197
  VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
181
198
  VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
199
+ VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
182
200
 
183
201
 
184
202
  class VLLMModel(LLM):
@@ -207,6 +225,10 @@ class VLLMModel(LLM):
207
225
  self._engine = None
208
226
  self.lora_modules = peft_model
209
227
  self.lora_requests: List[LoRARequest] = []
228
+ self._xavier_config = None
229
+
230
+ def set_xavier_config(self, value: Optional[Dict]):
231
+ self._xavier_config = value # type: ignore
210
232
 
211
233
  def load(self):
212
234
  try:
@@ -232,7 +254,6 @@ class VLLMModel(LLM):
232
254
  multiprocessing.set_start_method("fork", force=True)
233
255
 
234
256
  self._model_config = self._sanitize_model_config(self._model_config)
235
-
236
257
  if self.lora_modules is None:
237
258
  self.lora_requests = []
238
259
  else:
@@ -253,13 +274,34 @@ class VLLMModel(LLM):
253
274
  f"Enable lora: {enable_lora}. Lora count: {max_loras}."
254
275
  )
255
276
 
256
- engine_args = AsyncEngineArgs(
257
- model=self.model_path,
258
- enable_lora=enable_lora,
259
- max_loras=max_loras,
260
- **self._model_config,
261
- )
262
- self._engine = AsyncLLMEngine.from_engine_args(engine_args)
277
+ if self._xavier_config is not None:
278
+ from .xavier.engine import XavierEngine
279
+
280
+ # Enabling Xavier means that `enable_prefix_caching` is enabled by default.
281
+ self._model_config.setdefault("enable_prefix_caching", True)
282
+ xavier_transfer_block_num = self._model_config.pop(
283
+ "xavier_transfer_block_num", 512
284
+ )
285
+ self._xavier_config["transfer_block_num"] = xavier_transfer_block_num
286
+ engine_args = AsyncEngineArgs(
287
+ model=self.model_path,
288
+ enable_lora=enable_lora,
289
+ max_loras=max_loras,
290
+ **self._model_config,
291
+ )
292
+
293
+ logger.debug(f"Start xavier for vllm with config: {self._xavier_config}")
294
+ self._engine = XavierEngine.from_engine_args(
295
+ engine_args, xavier_config=self._xavier_config
296
+ )
297
+ else:
298
+ engine_args = AsyncEngineArgs(
299
+ model=self.model_path,
300
+ enable_lora=enable_lora,
301
+ max_loras=max_loras,
302
+ **self._model_config,
303
+ )
304
+ self._engine = AsyncLLMEngine.from_engine_args(engine_args)
263
305
 
264
306
  self._check_health_task = None
265
307
  if hasattr(self._engine, "check_health"):
@@ -277,6 +319,9 @@ class VLLMModel(LLM):
277
319
  model_executor.shutdown()
278
320
  self._engine = None
279
321
 
322
+ async def init_xavier(self):
323
+ await self._engine.init_xavier()
324
+
280
325
  async def _check_healthy(self, interval: int = 30):
281
326
  from vllm.engine.async_llm_engine import AsyncEngineDeadError
282
327
 
@@ -314,7 +359,10 @@ class VLLMModel(LLM):
314
359
  model_config.setdefault("max_num_seqs", 256)
315
360
  model_config.setdefault("quantization", None)
316
361
  model_config.setdefault("max_model_len", None)
317
-
362
+ model_config.setdefault("guided_decoding_backend", "outlines")
363
+ # Add scheduling policy if vLLM version is 0.6.3 or higher
364
+ if vllm.__version__ >= "0.6.3":
365
+ model_config.setdefault("scheduling_policy", "fcfs")
318
366
  return model_config
319
367
 
320
368
  @staticmethod
@@ -325,6 +373,22 @@ class VLLMModel(LLM):
325
373
  generate_config = {}
326
374
 
327
375
  sanitized = VLLMGenerateConfig()
376
+
377
+ response_format = generate_config.pop("response_format", None)
378
+ guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
379
+ guided_json_object = None
380
+ guided_json = None
381
+
382
+ if response_format is not None:
383
+ if response_format.get("type") == "json_object":
384
+ guided_json_object = True
385
+ elif response_format.get("type") == "json_schema":
386
+ json_schema = response_format.get("json_schema")
387
+ assert json_schema is not None
388
+ guided_json = json_schema.get("json_schema")
389
+ if guided_decoding_backend is None:
390
+ guided_decoding_backend = "outlines"
391
+
328
392
  sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
329
393
  sanitized.setdefault("n", generate_config.get("n", 1))
330
394
  sanitized.setdefault("best_of", generate_config.get("best_of", None))
@@ -346,6 +410,31 @@ class VLLMModel(LLM):
346
410
  sanitized.setdefault(
347
411
  "stream_options", generate_config.get("stream_options", None)
348
412
  )
413
+ sanitized.setdefault(
414
+ "skip_special_tokens", generate_config.get("skip_special_tokens", True)
415
+ )
416
+ sanitized.setdefault(
417
+ "guided_json", generate_config.get("guided_json", guided_json)
418
+ )
419
+ sanitized.setdefault("guided_regex", generate_config.get("guided_regex", None))
420
+ sanitized.setdefault(
421
+ "guided_choice", generate_config.get("guided_choice", None)
422
+ )
423
+ sanitized.setdefault(
424
+ "guided_grammar", generate_config.get("guided_grammar", None)
425
+ )
426
+ sanitized.setdefault(
427
+ "guided_whitespace_pattern",
428
+ generate_config.get("guided_whitespace_pattern", None),
429
+ )
430
+ sanitized.setdefault(
431
+ "guided_json_object",
432
+ generate_config.get("guided_json_object", guided_json_object),
433
+ )
434
+ sanitized.setdefault(
435
+ "guided_decoding_backend",
436
+ generate_config.get("guided_decoding_backend", guided_decoding_backend),
437
+ )
349
438
 
350
439
  return sanitized
351
440
 
@@ -483,13 +572,46 @@ class VLLMModel(LLM):
483
572
  if isinstance(stream_options, dict)
484
573
  else False
485
574
  )
486
- sampling_params = SamplingParams(**sanitized_generate_config)
575
+
576
+ if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
577
+ # guided decoding only available for vllm >= 0.6.3
578
+ from vllm.sampling_params import GuidedDecodingParams
579
+
580
+ guided_options = GuidedDecodingParams.from_optional(
581
+ json=sanitized_generate_config.pop("guided_json", None),
582
+ regex=sanitized_generate_config.pop("guided_regex", None),
583
+ choice=sanitized_generate_config.pop("guided_choice", None),
584
+ grammar=sanitized_generate_config.pop("guided_grammar", None),
585
+ json_object=sanitized_generate_config.pop("guided_json_object", None),
586
+ backend=sanitized_generate_config.pop("guided_decoding_backend", None),
587
+ whitespace_pattern=sanitized_generate_config.pop(
588
+ "guided_whitespace_pattern", None
589
+ ),
590
+ )
591
+
592
+ sampling_params = SamplingParams(
593
+ guided_decoding=guided_options, **sanitized_generate_config
594
+ )
595
+ else:
596
+ # ignore generate configs
597
+ sanitized_generate_config.pop("guided_json", None)
598
+ sanitized_generate_config.pop("guided_regex", None)
599
+ sanitized_generate_config.pop("guided_choice", None)
600
+ sanitized_generate_config.pop("guided_grammar", None)
601
+ sanitized_generate_config.pop("guided_json_object", None)
602
+ sanitized_generate_config.pop("guided_decoding_backend", None)
603
+ sanitized_generate_config.pop("guided_whitespace_pattern", None)
604
+ sampling_params = SamplingParams(**sanitized_generate_config)
605
+
487
606
  if not request_id:
488
607
  request_id = str(uuid.uuid1())
489
608
 
490
609
  assert self._engine is not None
491
610
  results_generator = self._engine.generate(
492
- prompt, sampling_params, request_id, lora_request=lora_request
611
+ prompt,
612
+ sampling_params,
613
+ request_id,
614
+ lora_request,
493
615
  )
494
616
 
495
617
  async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
@@ -772,6 +894,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
772
894
  "image": 2, # default 2 images all chat
773
895
  }
774
896
  )
897
+ # Add scheduling policy if vLLM version is 0.6.3 or higher
898
+ if vllm.__version__ >= "0.6.3":
899
+ model_config.setdefault("scheduling_policy", "fcfs")
775
900
 
776
901
  return model_config
777
902
 
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-2025 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,74 @@
1
+ # Copyright 2022-2025 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional
15
+
16
+ from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
17
+ from vllm.core.block.interfaces import DeviceAwareBlockAllocator
18
+ from vllm.platforms import current_platform
19
+ from vllm.utils import Device
20
+
21
+ from .block import XavierPrefixCachingBlockAllocator
22
+
23
+
24
class XavierCpuGpuBlockAllocator(CpuGpuBlockAllocator):
    """CPU/GPU block allocator that wires the Xavier configuration into the
    underlying per-device allocators."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Xavier runtime configuration; populated later via the setter.
        self._xavier_config: Optional[Dict[str, Any]] = None

    @property
    def xavier_config(self):
        return self._xavier_config

    @xavier_config.setter
    def xavier_config(self, v: Dict[str, Any]):
        self._xavier_config = v
        # Forwarded to the GPU allocator only; the CPU allocator is created
        # without isolation (see `create`) and presumably does not take part
        # in Xavier transfers -- TODO confirm.
        self._allocators[Device.GPU].xavier_config = v

    @staticmethod
    def create(
        allocator_type: str,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        block_size: int,
    ) -> DeviceAwareBlockAllocator:
        """Xinference Change!!!
        1. The code is copied here because the `allocator` needs to be instantiated as a subclass.
        2. Why not re-instantiate it externally?
        Re-instantiating the `allocator` is costly because it requires initializing many tensors.
        """
        # NOTE(review): `allocator_type` is ignored here -- both allocators are
        # always XavierPrefixCachingBlockAllocator, regardless of the requested
        # type.

        # For HPU, block id 0 is used only for padding
        reserved_blocks = 1 if current_platform.is_hpu() else 0
        # Global block-id space is shared: GPU ids first, then CPU ids.
        block_ids = list(range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
        num_gpu_blocks -= reserved_blocks
        gpu_block_ids = block_ids[:num_gpu_blocks]
        cpu_block_ids = block_ids[num_gpu_blocks:]

        # Only the GPU allocator runs the isolated event loop used for async
        # tracker-actor calls.
        gpu_allocator = XavierPrefixCachingBlockAllocator(
            run_isolation=True,
            num_blocks=num_gpu_blocks,
            block_size=block_size,
            block_ids=gpu_block_ids,
        )

        cpu_allocator = XavierPrefixCachingBlockAllocator(
            num_blocks=num_cpu_blocks,
            block_size=block_size,
            block_ids=cpu_block_ids,
        )

        return XavierCpuGpuBlockAllocator(
            cpu_block_allocator=cpu_allocator,
            gpu_block_allocator=gpu_allocator,
        )
+ )
@@ -0,0 +1,111 @@
1
+ # Copyright 2022-2025 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import asyncio
15
+ import logging
16
+ from typing import Any, Dict, Optional
17
+
18
+ import xoscar as xo
19
+ from vllm.core.block.interfaces import BlockId
20
+ from vllm.core.block.prefix_caching_block import (
21
+ BlockTracker,
22
+ PrefixCachingBlockAllocator,
23
+ )
24
+
25
+ from .....isolation import Isolation
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class XavierInnerBlockTracker(BlockTracker):
    """Used to track the status of a block inside the prefix caching allocator"""

    """
    Here, two fixed attributes, `transferred` and `executed`,
    have been added to the `BlockTracker` class to mark the status of the corresponding `block_id`.
    We cannot directly set attributes on the `Block` object
    because the `Block` objects are dynamically allocated with each scheduling.
    The `Block` objects executed in two different scheduling steps may have the same `id`, `hash`, etc.,
    but the instance objects may differ.
    The BlockTracker object inside vllm is one-to-one with the block_id.
    """
    # __slots__ must re-declare the parent's slots plus the two new flags,
    # since BlockTracker itself is slotted.
    __slots__ = ("active", "last_accessed", "computed", "transferred", "executed")

    def __init__(self):
        super().__init__()
        # Whether this block's KV data has been transferred from a peer rank.
        self.transferred = False
        # Whether this block has been executed (computed) locally.
        self.executed = False
48
+
49
+
50
class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
    """Prefix-caching allocator extended with Xavier block-status tracking.

    Replaces vLLM's per-block trackers with :class:`XavierInnerBlockTracker`
    so each ``block_id`` carries ``transferred`` / ``executed`` flags, and
    notifies a remote tracker actor whenever a cached block is evicted.
    """

    def __init__(self, *args, run_isolation: bool = False, **kwargs):
        super().__init__(*args, **kwargs)
        # Swap in Xavier-aware trackers for every pre-created block id.
        for _id in self._block_tracker.keys():
            self._block_tracker[_id] = XavierInnerBlockTracker()

        self._xavier_config: Optional[Dict[str, Any]] = None
        # Cached xoscar actor ref; resolved lazily in _get_block_tracker_ref.
        self._block_tracker_ref = None
        if run_isolation:
            # Dedicated event-loop thread: the async tracker-actor calls are
            # issued from vLLM's synchronous allocator code path.
            self._isolation = Isolation(
                asyncio.new_event_loop(), threaded=True, daemon=True
            )
            self._isolation.start()
        else:
            self._isolation = None  # type: ignore

    def __del__(self):
        # getattr guard: __del__ can run even if __init__ failed before
        # `_isolation` was assigned, which would otherwise raise
        # AttributeError during garbage collection.
        isolation = getattr(self, "_isolation", None)
        if isolation is not None:
            isolation.stop()

    @property
    def xavier_config(self):
        return self._xavier_config

    @xavier_config.setter
    def xavier_config(self, v: Dict[str, Any]):
        self._xavier_config = v

    async def _get_block_tracker_ref(self):
        # Lazily resolve (and cache) the actor that tracks block ownership.
        if self._block_tracker_ref is None:
            block_tracker_address = self.xavier_config.get("block_tracker_address")
            block_tracker_uid = self.xavier_config.get("block_tracker_uid")
            self._block_tracker_ref = await xo.actor_ref(
                address=block_tracker_address, uid=block_tracker_uid
            )
        return self._block_tracker_ref

    async def unregister_block(self, block_id: int):
        """Tell the tracker actor this engine/rank no longer holds `block_id`."""
        assert self._xavier_config is not None
        tracker_ref = await self._get_block_tracker_ref()
        await tracker_ref.unregister_block(
            self.xavier_config.get("virtual_engine"),
            self.xavier_config.get("rank"),
            block_id,
        )

    def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
        """
        This is the only entry point where the `block_id` is evicted from the cache.
        Therefore, when the `block_id` is evicted, the tracker actor needs to unregister the block information.
        At the same time, make sure to reset the attributes corresponding to that `block_id`.
        """
        evicted_block_id = super()._maybe_allocate_evicted_block_id()
        # Log only real evictions: the previous unconditional debug lines
        # emitted "block_id: None will be evicted" on every non-evicting
        # allocation. Lazy %-args keep this cheap on the hot path.
        if evicted_block_id is not None and self._isolation is not None:
            logger.debug(
                "block_id: %s will be evicted from the cache.", evicted_block_id
            )
            tracker = self._block_tracker[evicted_block_id]
            assert isinstance(tracker, XavierInnerBlockTracker)
            # Reset the Xavier flags so the reused id starts clean.
            tracker.transferred = False
            tracker.executed = False
            # Fire the async unregister call on the isolated loop thread.
            self._isolation.call(self.unregister_block(evicted_block_id))
            logger.debug("block_id: %s will be used again.", evicted_block_id)
        return evicted_block_id
@@ -0,0 +1,71 @@
1
+ # Copyright 2022-2025 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ from typing import Any, Dict, Optional
16
+
17
+ from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
18
+ from vllm.core.block.interfaces import Block
19
+ from vllm.core.block_manager import SelfAttnBlockSpaceManager
20
+ from vllm.sequence import SequenceGroup, SequenceStatus
21
+ from vllm.utils import Device
22
+
23
+ from .allocator import XavierCpuGpuBlockAllocator
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class XavierBlockManager(SelfAttnBlockSpaceManager):
    """Block-space manager that substitutes Xavier-aware block allocators
    and exposes per-block status helpers used by the transfer logic."""

    def __init__(self, *args, **kwargs):
        # Monkey patch: super().__init__ calls CpuGpuBlockAllocator.create,
        # so patching it first makes vLLM build Xavier allocators instead.
        CpuGpuBlockAllocator.create = XavierCpuGpuBlockAllocator.create
        super().__init__(*args, **kwargs)
        self._xavier_config: Optional[Dict[str, Any]] = None
        logger.debug("Init xavier block manager done.")

    @property
    def xavier_config(self):
        return self._xavier_config

    @xavier_config.setter
    def xavier_config(self, value: Dict[str, Any]):
        self._xavier_config = value
        # Propagate down so the allocators can reach the tracker actor.
        self.block_allocator.xavier_config = value

    def get_block_by_block_id(self, seq_id: int, block_id: int) -> Block:
        # Linear scan of the sequence's block table for a matching physical id.
        # NOTE(review): implicitly returns None when no block matches, despite
        # the `-> Block` annotation -- callers presumably guarantee presence;
        # confirm or raise explicitly.
        table = self.block_tables[seq_id]
        for b in table.blocks:
            if b.block_id == block_id:
                return b

    def get_block_status_by_block_id(self, status_name: str, block_id: int) -> bool:
        # Reads the `transferred` / `executed` flag off the GPU-side
        # XavierInnerBlockTracker for this block id.
        tracker = self.block_allocator._allocators[Device.GPU]._block_tracker[block_id]
        return getattr(tracker, status_name)

    def set_block_status_by_block_id(
        self, status_name: str, block_id: int, status: bool
    ) -> None:
        tracker = self.block_allocator._allocators[Device.GPU]._block_tracker[block_id]
        # Guard against typos: the attribute must already exist on the tracker.
        assert getattr(tracker, status_name, None) is not None
        setattr(tracker, status_name, status)

    def allocate(self, seq_group: SequenceGroup) -> None:
        """
        If the `seq_group` has the `transferred` attribute,
        it indicates that the `seq_group` has gone through the transfer process,
        so the block allocation logic should not be executed again.
        """
        waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
        # NOTE(review): with no WAITING sequences, all([]) is True and this
        # returns without calling super().allocate -- confirm that is intended.
        if all([getattr(s, "transferred", False) for s in waiting_seqs]):
            return
        super().allocate(seq_group)
+ super().allocate(seq_group)