xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.
Files changed (343)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +77 -71
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +79 -19
  6. xinference/core/supervisor.py +172 -10
  7. xinference/core/utils.py +12 -8
  8. xinference/core/worker.py +102 -4
  9. xinference/deploy/cmdline.py +3 -1
  10. xinference/deploy/test/test_cmdline.py +56 -0
  11. xinference/isolation.py +24 -0
  12. xinference/model/audio/core.py +16 -0
  13. xinference/model/audio/cosyvoice.py +39 -6
  14. xinference/model/audio/f5tts.py +200 -0
  15. xinference/model/audio/f5tts_mlx.py +260 -0
  16. xinference/model/audio/fish_speech.py +36 -111
  17. xinference/model/audio/melotts.py +110 -0
  18. xinference/model/audio/model_spec.json +99 -3
  19. xinference/model/audio/model_spec_modelscope.json +27 -0
  20. xinference/model/audio/utils.py +32 -0
  21. xinference/model/audio/whisper.py +35 -10
  22. xinference/model/embedding/core.py +203 -142
  23. xinference/model/embedding/model_spec.json +7 -0
  24. xinference/model/embedding/model_spec_modelscope.json +8 -0
  25. xinference/model/image/core.py +69 -1
  26. xinference/model/image/model_spec.json +145 -4
  27. xinference/model/image/model_spec_modelscope.json +150 -4
  28. xinference/model/image/stable_diffusion/core.py +45 -13
  29. xinference/model/llm/__init__.py +4 -2
  30. xinference/model/llm/llm_family.json +536 -53
  31. xinference/model/llm/llm_family.py +15 -36
  32. xinference/model/llm/llm_family_modelscope.json +454 -20
  33. xinference/model/llm/memory.py +1 -1
  34. xinference/model/llm/mlx/core.py +248 -52
  35. xinference/model/llm/sglang/core.py +1 -0
  36. xinference/model/llm/transformers/chatglm.py +9 -5
  37. xinference/model/llm/transformers/cogagent.py +272 -0
  38. xinference/model/llm/transformers/core.py +2 -0
  39. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  40. xinference/model/llm/transformers/utils.py +16 -8
  41. xinference/model/llm/utils.py +36 -4
  42. xinference/model/llm/vllm/core.py +53 -10
  43. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  44. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  45. xinference/model/llm/vllm/xavier/block.py +111 -0
  46. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  47. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  48. xinference/model/llm/vllm/xavier/collective.py +74 -0
  49. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  50. xinference/model/llm/vllm/xavier/engine.py +247 -0
  51. xinference/model/llm/vllm/xavier/executor.py +134 -0
  52. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  53. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  54. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  55. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  56. xinference/model/video/diffusers.py +14 -0
  57. xinference/model/video/model_spec.json +15 -0
  58. xinference/model/video/model_spec_modelscope.json +16 -0
  59. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  60. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  61. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  62. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  63. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  64. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  65. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  66. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  67. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  68. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  69. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  70. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  71. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  72. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  73. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  74. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  75. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  76. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  77. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  78. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  79. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  80. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  81. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  82. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  83. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  84. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  85. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  86. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  87. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  88. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  89. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  90. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  91. xinference/thirdparty/f5_tts/api.py +166 -0
  92. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  93. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  94. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  95. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  96. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  97. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  98. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  99. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  100. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  101. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  102. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  103. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  104. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  105. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  106. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  107. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  108. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  109. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  110. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  111. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  112. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  114. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  115. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  116. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  117. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  118. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  119. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  120. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  121. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  122. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  123. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  124. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  125. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  126. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  127. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  128. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  129. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  130. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  131. xinference/thirdparty/f5_tts/train/README.md +77 -0
  132. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  133. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  134. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  135. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  136. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  137. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  138. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  139. xinference/thirdparty/f5_tts/train/train.py +75 -0
  140. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  141. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  142. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  143. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  144. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  145. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  146. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  147. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  148. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  149. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  150. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  151. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  152. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  153. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  154. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  155. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  156. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  157. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  158. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  159. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  160. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  161. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  162. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  163. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  164. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  165. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  166. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  167. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  168. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  169. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  170. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  171. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  172. xinference/thirdparty/matcha/utils/utils.py +2 -2
  173. xinference/thirdparty/melo/api.py +135 -0
  174. xinference/thirdparty/melo/app.py +61 -0
  175. xinference/thirdparty/melo/attentions.py +459 -0
  176. xinference/thirdparty/melo/commons.py +160 -0
  177. xinference/thirdparty/melo/configs/config.json +94 -0
  178. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  179. xinference/thirdparty/melo/data_utils.py +413 -0
  180. xinference/thirdparty/melo/download_utils.py +67 -0
  181. xinference/thirdparty/melo/infer.py +25 -0
  182. xinference/thirdparty/melo/init_downloads.py +14 -0
  183. xinference/thirdparty/melo/losses.py +58 -0
  184. xinference/thirdparty/melo/main.py +36 -0
  185. xinference/thirdparty/melo/mel_processing.py +174 -0
  186. xinference/thirdparty/melo/models.py +1030 -0
  187. xinference/thirdparty/melo/modules.py +598 -0
  188. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  189. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  190. xinference/thirdparty/melo/preprocess_text.py +135 -0
  191. xinference/thirdparty/melo/split_utils.py +174 -0
  192. xinference/thirdparty/melo/text/__init__.py +35 -0
  193. xinference/thirdparty/melo/text/chinese.py +199 -0
  194. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  195. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  196. xinference/thirdparty/melo/text/cleaner.py +36 -0
  197. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  198. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  199. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  200. xinference/thirdparty/melo/text/english.py +284 -0
  201. xinference/thirdparty/melo/text/english_bert.py +39 -0
  202. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  203. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  204. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  205. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  206. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  207. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  208. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  209. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  210. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  211. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  212. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  213. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  214. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  215. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  216. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  217. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  218. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  219. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  220. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  221. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  222. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  223. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  224. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  225. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  226. xinference/thirdparty/melo/text/french.py +94 -0
  227. xinference/thirdparty/melo/text/french_bert.py +39 -0
  228. xinference/thirdparty/melo/text/japanese.py +647 -0
  229. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  230. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  231. xinference/thirdparty/melo/text/korean.py +192 -0
  232. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  233. xinference/thirdparty/melo/text/spanish.py +122 -0
  234. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  235. xinference/thirdparty/melo/text/symbols.py +290 -0
  236. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  237. xinference/thirdparty/melo/train.py +635 -0
  238. xinference/thirdparty/melo/train.sh +19 -0
  239. xinference/thirdparty/melo/transforms.py +209 -0
  240. xinference/thirdparty/melo/utils.py +424 -0
  241. xinference/types.py +15 -0
  242. xinference/web/ui/build/asset-manifest.json +6 -6
  243. xinference/web/ui/build/index.html +1 -1
  244. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  245. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  246. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  247. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  248. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  249. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  250. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  251. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  252. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  253. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  254. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  255. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  256. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  257. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  258. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  259. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  260. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  261. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  262. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  263. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  264. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  265. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  266. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  292. xinference/web/ui/node_modules/.package-lock.json +67 -3
  293. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  294. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  295. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  296. xinference/web/ui/node_modules/i18next/package.json +129 -0
  297. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  298. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  299. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  300. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  301. xinference/web/ui/package-lock.json +69 -3
  302. xinference/web/ui/package.json +2 -0
  303. xinference/web/ui/src/locales/en.json +186 -0
  304. xinference/web/ui/src/locales/zh.json +186 -0
  305. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/METADATA +68 -32
  306. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/RECORD +316 -122
  307. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  308. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  309. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  310. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  311. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  312. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  313. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  314. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  315. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  316. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  317. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  318. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  319. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  320. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  321. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  322. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  323. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  324. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  325. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  326. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  327. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  328. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  329. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  330. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  331. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  332. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  333. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  334. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  335. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  336. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  337. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  338. /xinference/thirdparty/{fish_speech/tools → melo/text/fr_phonemizer}/__init__.py +0 -0
  339. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  340. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  341. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/WHEEL +0 -0
  342. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  343. {xinference-1.0.1.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/transformers/cogagent.py (new file):
@@ -0,0 +1,272 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import re
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, Iterator, List, Literal, Optional, Union
+
+import torch
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    CogagentGenerateConfig,
+    CompletionChunk,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
+from .core import PytorchChatModel
+from .utils import cache_clean
+
+logger = logging.getLogger(__name__)
+
+
+class CogAgentChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._torch_type = None
+        self._device = None
+        self._tokenizer = None
+        self._model = None
+        self._platform: Literal["Mac", "WIN", "Mobile"] | None = "Mac"
+        self._format: Literal[
+            "(Answer in Action-Operation-Sensitive format.)",
+            "(Answer in Status-Plan-Action-Operation format.)",
+            "(Answer in Status-Action-Operation-Sensitive format.)",
+            "(Answer in Status-Action-Operation format.)",
+            "(Answer in Action-Operation format.)",
+        ] | None = "(Answer in Action-Operation-Sensitive format.)"
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "cogagent" in family.lower():
+            return True
+        return False
+
+    def load(self, **kwargs):
+        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+        device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(device)
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        if self.quantization == "4-bit":
+            quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+        elif self.quantization == "8-bit":
+            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+        else:
+            quantization_config = None
+
+        self._model = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            device_map=self._device,
+            quantization_config=quantization_config,
+        ).eval()
+
+    def _message_content_to_cogagent(self, content):
+        assert isinstance(content, list)
+        texts = []
+        image_urls = []
+        for c in content:
+            c_type = c.get("type")
+            if c_type == "text":
+                texts.append(c["text"])
+            elif c_type == "image_url":
+                image_urls.append(c["image_url"]["url"])
+        image_futures = []
+        with ThreadPoolExecutor() as executor:
+            for image_url in image_urls:
+                fut = executor.submit(_decode_image, image_url)
+                image_futures.append(fut)
+        images = [fut.result() for fut in image_futures]
+        text = " ".join(texts)
+        if len(images) == 0:
+            raise RuntimeError(
+                "CogAgent requires image input to perform GUI Agent tasks. Pure text-based interaction cannot execute such tasks."
+            )
+        elif len(images) == 1:
+            return text, images[-1]
+        else:
+            logger.warning(
+                "There are multiple images in the prompt, CogAgent will automatically use the most recently provided image as the input."
+            )
+            return text, images[-1]
+
+    def _history_content_to_cogagent(self, chat_history: List[Dict]):
+        grounded_pattern = r"Grounded Operation:\s*(.*)"
+        action_pattern = r"Action:\s*(.*)"
+
+        def extract_operations(_content: str):
+            """extract grounded operation and action operation"""
+            _history_step = []
+            _history_action = []
+
+            matches_history = re.search(grounded_pattern, _content)
+            matches_actions = re.search(action_pattern, _content)
+
+            if matches_history:
+                grounded_operation = matches_history.group(1)
+                _history_step.append(grounded_operation)
+            if matches_actions:
+                action_operation = matches_actions.group(1)
+                _history_action.append(action_operation)
+
+            return _history_step, _history_action
+
+        history_step = []
+        history_action = []
+
+        for i in range(0, len(chat_history) - 1, 2):
+            content = chat_history[i + 1].get("content")
+            if isinstance(content, str):  # the content is a string
+                steps, actions = extract_operations(content)
+                history_step.extend(steps)
+                history_action.extend(actions)
+
+            elif isinstance(content, list):  # the content is a list
+                for c in content:
+                    c_content = c.get("content")
+                    if isinstance(c_content, str):  # make sure it is a string
+                        steps, actions = extract_operations(c_content)
+                        history_step.extend(steps)
+                        history_action.extend(actions)
+
+        return history_step, history_action
+
+    def get_query_and_history(
+        self,
+        prompt: Union[str, List[Dict]],
+        chat_history: Optional[List[Dict]] = None,
+    ):
+        task, image = self._message_content_to_cogagent(prompt)
+
+        history_step, history_action = [], []
+
+        if chat_history:
+            history_step, history_action = self._history_content_to_cogagent(
+                chat_history
+            )
+
+        # Verify history lengths match
+        if len(history_step) != len(history_action):
+            raise ValueError("Mismatch in lengths of history_step and history_action.")
+
+        # Format history steps for output
+        history_str = "\nHistory steps: "
+        for index, (step, action) in enumerate(zip(history_step, history_action)):
+            history_str += f"\n{index}. {step}\t{action}"
+
+        # Compose the query with task, platform, and selected format instructions
+        query = f"Task: {task}{history_str}\n{self._platform}{self._format}"
+        logger.info(f"query:{query}")
+        return query, image
+
+    @cache_clean
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[CogagentGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config is not None:
+            self._platform = generate_config.pop("platform", self._platform)
+            self._format = generate_config.pop("format", self._format)
+
+        sanitize_generate_config = self._sanitize_generate_config(generate_config)
+        stream = sanitize_generate_config.get("stream")
+        sanitized_config = {
+            "max_length": sanitize_generate_config.get("max_tokens", 512),
+            "top_k": sanitize_generate_config.get("top_k", 1),
+            "do_sample": True,
+        }
+        prompt, _, chat_history = parse_messages(messages)
+
+        query, image = self.get_query_and_history(prompt, chat_history)
+
+        full_context_kwargs = {
+            "return_tensors": "pt",
+            "return_dict": True,
+        }
+        assert self.model_family.chat_template is not None
+        inputs = self.get_full_context(
+            [{"role": "user", "image": image, "content": query}],
+            self.model_family.chat_template,
+            self._tokenizer,
+            tokenize=True,
+            **full_context_kwargs,
+        )
+        inputs.to(self._model.device)
+
+        if stream:
+            it = self._streaming_chat_response(inputs, sanitized_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            # Generate response
+            with torch.no_grad():
+                outputs = self._model.generate(**inputs, **sanitized_config)
+                outputs = outputs[:, inputs["input_ids"].shape[1] :]
+                response = self._tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            return generate_chat_completion(self.model_uid, response)
+
+    def _streaming_chat_response(
+        self, inputs: Dict, config: Dict
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        streamer = TextIteratorStreamer(
+            self._tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = {**inputs, **config}
+
+        thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
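
Note on the new model: below is a minimal, hypothetical client-side sketch of the message shape that _message_content_to_cogagent parses. The endpoint, model uid, and image path are placeholders, and "platform"/"format" are the CogAgent-specific keys that chat() pops from generate_config; this is a sketch, not the project's documented usage.

    from xinference.client import Client

    client = Client("http://localhost:9997")  # placeholder endpoint
    model = client.get_model("cogagent")      # placeholder model uid

    completion = model.chat(
        messages=[
            {
                "role": "user",
                "content": [
                    # text parts are joined with spaces into the "Task: ..." query
                    {"type": "text", "text": "Open the browser"},
                    # at least one image is required; with several, the last one is used
                    {"type": "image_url", "image_url": {"url": "file:///tmp/screen.png"}},
                ],
            }
        ],
        generate_config={
            "platform": "WIN",  # popped by chat(); defaults to "Mac"
            "format": "(Answer in Action-Operation format.)",
        },
    )
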
xinference/model/llm/__init__.py:
@@ -69,6 +69,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "deepseek-v2.5",
     "deepseek-v2-chat-0628",
     "glm-edge-v",
+    "QvQ-72B-Preview",
+    "cogagent",
 ]
 
 
xinference/model/llm/transformers/qwen2_vl.py:
@@ -17,6 +17,7 @@ import sys
 import uuid
 from typing import Iterator, List, Optional, Union
 
+from ....device_utils import is_npu_available
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
@@ -47,6 +48,8 @@ class Qwen2VLChatModel(PytorchChatModel):
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qvq-72b-preview".lower() in llm_family.lower():
+            return True
         return False
 
     def load(self):
@@ -71,6 +74,14 @@ class Qwen2VLChatModel(PytorchChatModel):
                 attn_implementation="flash_attention_2",
                 trust_remote_code=True,
             ).eval()
+        elif is_npu_available():
+            # Ascend NPUs do not support bf16
+            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+                self.model_path,
+                device_map="auto",
+                trust_remote_code=True,
+                torch_dtype="float16",
+            ).eval()
         else:
             self._model = Qwen2VLForConditionalGeneration.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
@@ -112,7 +123,7 @@ class Qwen2VLChatModel(PytorchChatModel):
             padding=True,
             return_tensors="pt",
         )
-        inputs = inputs.to("cuda")
+        inputs = inputs.to(self._device)
 
         # Inference: Generation of the output
         generated_ids = self._model.generate(
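
Note: the new branch exists because Ascend NPUs lack bfloat16 support; the diff gates it on xinference's own is_npu_available helper. A standalone sketch of the same dtype-fallback pattern follows; the torch_npu import and torch.npu.is_available() check are assumptions about the optional Ascend backend, not code from this diff.

    import torch

    def pick_qwen2_vl_dtype() -> torch.dtype:
        # Prefer bf16 (used on the CUDA/flash-attention path above); fall back
        # to fp16 where bf16 is unsupported, e.g. on Ascend NPUs.
        try:
            import torch_npu  # noqa: F401  # optional Ascend backend (assumed)
            if torch.npu.is_available():  # attribute patched in by torch_npu
                return torch.float16
        except ImportError:
            pass
        return torch.bfloat16
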
xinference/model/llm/transformers/utils.py:
@@ -156,6 +156,7 @@ def _get_completion(
     finish_reason: Optional[str],
     model_uid: str,
     r: InferenceRequest,
+    completion_tokens: int,
 ):
     completion_choice = CompletionChoice(
         text=output, index=0, logprobs=None, finish_reason=finish_reason
@@ -170,8 +171,8 @@ def _get_completion(
     )
     completion_usage = CompletionUsage(
         prompt_tokens=len(r.prompt_tokens),
-        completion_tokens=len(r.new_tokens),
-        total_tokens=len(r.prompt_tokens) + len(r.new_tokens),
+        completion_tokens=completion_tokens,
+        total_tokens=len(r.prompt_tokens) + completion_tokens,
     )
     completion = Completion(
         id=completion_chunk["id"],
@@ -371,7 +372,7 @@ def _batch_inference_one_step_internal(
         r.stopped = stopped
         r.finish_reason = finish_reason
 
-        if r.stopped and r not in stop_token_mapping and r not in output_mapping:
+        if r.stopped and r not in stop_token_mapping:
            stop_token_mapping[r] = _i + 1
 
        if r.stream:
@@ -446,12 +447,14 @@ def _batch_inference_one_step_internal(
            else:
                # last round, handle non-stream result
                if r.stopped and _i == decode_round - 1:
-                   invalid_token_num = decode_round - stop_token_mapping[r]
+                   invalid_token_num = (
+                       (decode_round - stop_token_mapping[r] + 1)
+                       if r.finish_reason == "stop"
+                       else (decode_round - stop_token_mapping[r])
+                   )
                    outputs = (
                        tokenizer.decode(
-                           r.new_tokens[: -(invalid_token_num + 1)]
-                           if r.finish_reason == "stop"
-                           else r.new_tokens[:-invalid_token_num],
+                           r.new_tokens[:-invalid_token_num],
                            skip_special_tokens=True,
                            spaces_between_special_tokens=False,
                            clean_up_tokenization_spaces=True,
@@ -460,7 +463,12 @@ def _batch_inference_one_step_internal(
                        else output_mapping[r]
                    )
                    completion = _get_completion(
-                       outputs, r.chunk_id, r.finish_reason, model_uid, r
+                       outputs,
+                       r.chunk_id,
+                       r.finish_reason,
+                       model_uid,
+                       r,
+                       len(r.new_tokens) - invalid_token_num,
                    )
                    r.completion = [completion]
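
Worked example of the accounting fix: with decode_round = 16 and stop_token_mapping[r] = 10, a request finished with reason "stop" previously trimmed 16 - 10 + 1 = 7 tokens from the decoded text but still reported the full len(r.new_tokens) as completion_tokens. Folding the + 1 (the stop token itself) into invalid_token_num lets one quantity drive both the decode slice and the reported usage, as in this toy check with illustrative values:

    decode_round = 16                 # decode steps in this round
    stop_step = 10                    # stand-in for stop_token_mapping[r]
    new_tokens = list(range(40))      # stand-in for r.new_tokens

    invalid_token_num = decode_round - stop_step + 1   # finish_reason == "stop"
    text_tokens = new_tokens[:-invalid_token_num]      # what gets decoded
    completion_tokens = len(new_tokens) - invalid_token_num  # reported usage
    assert len(text_tokens) == completion_tokens == 33
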
 
xinference/model/llm/utils.py:
@@ -52,6 +52,7 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen2-instruct",
     "qwen2-moe-instruct",
     "qwen2.5-instruct",
+    "qwen2.5-coder-instruct",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -96,13 +97,22 @@ class ChatModelMixin:
         return rendered
 
     def get_full_context(
-        self, messages: List, chat_template: str, tokenizer=None, **kwargs
-    ) -> str:
+        self,
+        messages: List,
+        chat_template: str,
+        tokenizer=None,
+        tokenize=False,
+        **kwargs,
+    ):
+        if "vision" not in self.model_family.model_ability:  # type: ignore
+            messages = self.convert_messages_with_content_list_to_str_conversion(
+                messages
+            )
         if tokenizer is not None:
             try:
                 full_context = tokenizer.apply_chat_template(
                     messages,
-                    tokenize=False,
+                    tokenize=tokenize,
                     chat_template=chat_template,
                     add_generation_prompt=True,
                     **kwargs,
@@ -118,6 +128,25 @@ class ChatModelMixin:
         # Compilation function uses a cache to avoid recompiling the same template
         return self._build_from_raw_template(messages, chat_template, **kwargs)
 
+    @staticmethod
+    def convert_messages_with_content_list_to_str_conversion(
+        messages: List[Dict],
+    ) -> List[Dict]:
+        """
+        Handles messages with content list conversion, in order to support Cline, see GH#2659.
+        """
+        for message in messages:
+            texts = ""
+            msg_content = message.get("content")
+            if msg_content:
+                if isinstance(msg_content, str):
+                    texts = msg_content
+                elif isinstance(msg_content, list):
+                    texts = "\n".join(item.get("text", "") for item in msg_content)
+            if texts:
+                message["content"] = texts
+        return messages
+
     @staticmethod
     def get_specific_prompt(model_family: str, messages: List[ChatCompletionMessage]):
         """
@@ -324,7 +353,10 @@ class ChatModelMixin:
         """
         try:
             if isinstance(c, dict):
-                return [(None, c["name"], c["arguments"])]
+                try:
+                    return [(None, c["name"], json.loads(c["arguments"]))]
+                except Exception:
+                    return [(None, c["name"], c["arguments"])]
         except KeyError:
             logger.error("Can't parse glm output: %s", c)
             return [(str(c), None, None)]
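
For illustration, a hedged sketch of both utils.py changes above, with invented example values; it assumes ChatModelMixin is importable from xinference.model.llm.utils, the file these hunks patch.

    import json

    from xinference.model.llm.utils import ChatModelMixin

    # 1) Content-list flattening (added to support Cline, per the docstring):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Review this file."},
                {"type": "text", "text": "Then summarize it."},
            ],
        }
    ]
    converted = ChatModelMixin.convert_messages_with_content_list_to_str_conversion(messages)
    assert converted[0]["content"] == "Review this file.\nThen summarize it."

    # 2) GLM tool-call arguments: JSON strings are now decoded,
    #    with the old pass-through kept as a fallback for non-JSON values.
    c = {"name": "get_weather", "arguments": '{"city": "Beijing"}'}
    try:
        parsed = [(None, c["name"], json.loads(c["arguments"]))]
    except Exception:
        parsed = [(None, c["name"], c["arguments"])]
    assert parsed == [(None, "get_weather", {"city": "Beijing"})]
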
xinference/model/llm/vllm/core.py:
@@ -70,6 +70,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_model_len: Optional[int]
     limit_mm_per_prompt: Optional[Dict[str, int]]
     guided_decoding_backend: Optional[str]
+    scheduling_policy: Optional[str]
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -86,6 +87,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
     stop: Optional[Union[str, List[str]]]
     stream: bool  # non-sampling param, should not be passed to the engine.
     stream_options: Optional[Union[dict, None]]
+    skip_special_tokens: Optional[bool]
     response_format: Optional[dict]
     guided_json: Optional[Union[str, dict]]
     guided_regex: Optional[str]
@@ -154,7 +156,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
-
+    VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -181,14 +184,19 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
+    VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
+
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
 
 
 class VLLMModel(LLM):
@@ -217,6 +225,10 @@ class VLLMModel(LLM):
         self._engine = None
         self.lora_modules = peft_model
         self.lora_requests: List[LoRARequest] = []
+        self._xavier_config = None
+
+    def set_xavier_config(self, value: Optional[Dict]):
+        self._xavier_config = value  # type: ignore
 
     def load(self):
         try:
@@ -242,7 +254,6 @@ class VLLMModel(LLM):
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
-
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -263,13 +274,34 @@ class VLLMModel(LLM):
             f"Enable lora: {enable_lora}. Lora count: {max_loras}."
         )
 
-        engine_args = AsyncEngineArgs(
-            model=self.model_path,
-            enable_lora=enable_lora,
-            max_loras=max_loras,
-            **self._model_config,
-        )
-        self._engine = AsyncLLMEngine.from_engine_args(engine_args)
+        if self._xavier_config is not None:
+            from .xavier.engine import XavierEngine
+
+            # Enabling Xavier means that `enable_prefix_caching` is enabled by default.
+            self._model_config.setdefault("enable_prefix_caching", True)
+            xavier_transfer_block_num = self._model_config.pop(
+                "xavier_transfer_block_num", 512
+            )
+            self._xavier_config["transfer_block_num"] = xavier_transfer_block_num
+            engine_args = AsyncEngineArgs(
+                model=self.model_path,
+                enable_lora=enable_lora,
+                max_loras=max_loras,
+                **self._model_config,
+            )
+
+            logger.debug(f"Start xavier for vllm with config: {self._xavier_config}")
+            self._engine = XavierEngine.from_engine_args(
+                engine_args, xavier_config=self._xavier_config
+            )
+        else:
+            engine_args = AsyncEngineArgs(
+                model=self.model_path,
+                enable_lora=enable_lora,
+                max_loras=max_loras,
+                **self._model_config,
+            )
+            self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
         self._check_health_task = None
         if hasattr(self._engine, "check_health"):
@@ -287,6 +319,9 @@ class VLLMModel(LLM):
             model_executor.shutdown()
         self._engine = None
 
+    async def init_xavier(self):
+        await self._engine.init_xavier()
+
     async def _check_healthy(self, interval: int = 30):
         from vllm.engine.async_llm_engine import AsyncEngineDeadError
 
@@ -325,7 +360,9 @@ class VLLMModel(LLM):
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
         model_config.setdefault("guided_decoding_backend", "outlines")
-
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
         return model_config
 
     @staticmethod
@@ -373,6 +410,9 @@ class VLLMModel(LLM):
         sanitized.setdefault(
             "stream_options", generate_config.get("stream_options", None)
         )
+        sanitized.setdefault(
+            "skip_special_tokens", generate_config.get("skip_special_tokens", True)
+        )
         sanitized.setdefault(
             "guided_json", generate_config.get("guided_json", guided_json)
         )
@@ -854,6 +894,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
                 "image": 2,  # default 2 images all chat
             }
         )
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
 
         return model_config
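
A hedged client-side view of the two new knobs, assuming a vLLM-backed model on vLLM 0.6.3 or newer; the endpoint and model uid are placeholders.

    from xinference.client import Client

    client = Client("http://localhost:9997")
    model = client.get_model("qwen2.5-instruct")  # placeholder uid

    completion = model.chat(
        messages=[{"role": "user", "content": "List three primes."}],
        generate_config={
            "max_tokens": 64,
            "skip_special_tokens": False,  # new pass-through; defaults to True
        },
    )
    # scheduling_policy, by contrast, is a launch-time engine option
    # (VLLMModelConfig above); it defaults to "fcfs", and vLLM >= 0.6.3
    # also accepts "priority".
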
 
xinference/model/llm/vllm/xavier/__init__.py (new file):
@@ -0,0 +1,13 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

xinference/model/llm/vllm/xavier/allocator.py (new file):
@@ -0,0 +1,74 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.core.block.interfaces import DeviceAwareBlockAllocator
+from vllm.platforms import current_platform
+from vllm.utils import Device
+
+from .block import XavierPrefixCachingBlockAllocator
+
+
+class XavierCpuGpuBlockAllocator(CpuGpuBlockAllocator):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._xavier_config: Optional[Dict[str, Any]] = None
+
+    @property
+    def xavier_config(self):
+        return self._xavier_config
+
+    @xavier_config.setter
+    def xavier_config(self, v: Dict[str, Any]):
+        self._xavier_config = v
+        self._allocators[Device.GPU].xavier_config = v
+
+    @staticmethod
+    def create(
+        allocator_type: str,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+        block_size: int,
+    ) -> DeviceAwareBlockAllocator:
+        """Xinference Change!!!
+        1. The code is copied here because the `allocator` needs to be instantiated as a subclass.
+        2. Why not re-instantiate it externally?
+           Re-instantiating the `allocator` is costly because it requires initializing many tensors.
+        """
+
+        # For HPU, block id 0 is used only for padding
+        reserved_blocks = 1 if current_platform.is_hpu() else 0
+        block_ids = list(range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
+        num_gpu_blocks -= reserved_blocks
+        gpu_block_ids = block_ids[:num_gpu_blocks]
+        cpu_block_ids = block_ids[num_gpu_blocks:]
+
+        gpu_allocator = XavierPrefixCachingBlockAllocator(
+            run_isolation=True,
+            num_blocks=num_gpu_blocks,
+            block_size=block_size,
+            block_ids=gpu_block_ids,
+        )
+
+        cpu_allocator = XavierPrefixCachingBlockAllocator(
+            num_blocks=num_cpu_blocks,
+            block_size=block_size,
+            block_ids=cpu_block_ids,
+        )
+
+        return XavierCpuGpuBlockAllocator(
+            cpu_block_allocator=cpu_allocator,
+            gpu_block_allocator=gpu_allocator,
+        )
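
Finally, a sketch of how the overridden create() might be exercised, mirroring how vLLM builds its block allocators; the sizes are illustrative, and any xavier_config keys beyond transfer_block_num are not shown in this diff.

    allocator = XavierCpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",  # this override always builds Xavier prefix-caching allocators
        num_gpu_blocks=1024,
        num_cpu_blocks=256,
        block_size=16,
    )
    # The setter fans the config out to the GPU-side allocator as well:
    allocator.xavier_config = {"transfer_block_num": 512}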