xinference 1.8.1rc1__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +2 -1
  3. xinference/core/model.py +5 -0
  4. xinference/core/supervisor.py +2 -3
  5. xinference/core/worker.py +3 -4
  6. xinference/deploy/local.py +5 -0
  7. xinference/deploy/worker.py +6 -0
  8. xinference/model/core.py +3 -0
  9. xinference/model/embedding/sentence_transformers/core.py +3 -4
  10. xinference/model/embedding/vllm/core.py +4 -3
  11. xinference/model/image/model_spec.json +69 -0
  12. xinference/model/image/stable_diffusion/core.py +22 -0
  13. xinference/model/llm/cache_manager.py +17 -3
  14. xinference/model/llm/harmony.py +245 -0
  15. xinference/model/llm/llm_family.json +293 -8
  16. xinference/model/llm/llm_family.py +1 -1
  17. xinference/model/llm/sglang/core.py +108 -5
  18. xinference/model/llm/transformers/core.py +15 -7
  19. xinference/model/llm/transformers/gemma3.py +1 -1
  20. xinference/model/llm/transformers/gpt_oss.py +91 -0
  21. xinference/model/llm/transformers/multimodal/core.py +1 -1
  22. xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
  23. xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
  24. xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
  25. xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
  26. xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
  27. xinference/model/llm/transformers/utils.py +1 -33
  28. xinference/model/llm/utils.py +61 -7
  29. xinference/model/llm/vllm/core.py +38 -8
  30. xinference/model/rerank/__init__.py +66 -23
  31. xinference/model/rerank/cache_manager.py +35 -0
  32. xinference/model/rerank/core.py +84 -339
  33. xinference/model/rerank/custom.py +33 -8
  34. xinference/model/rerank/model_spec.json +251 -212
  35. xinference/model/rerank/rerank_family.py +137 -0
  36. xinference/model/rerank/sentence_transformers/__init__.py +13 -0
  37. xinference/model/rerank/sentence_transformers/core.py +337 -0
  38. xinference/model/rerank/vllm/__init__.py +13 -0
  39. xinference/model/rerank/vllm/core.py +106 -0
  40. xinference/model/utils.py +109 -0
  41. xinference/types.py +2 -0
  42. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  43. xinference/ui/web/ui/build/index.html +1 -1
  44. xinference/ui/web/ui/build/static/js/{main.b969199a.js → main.4918643a.js} +3 -3
  45. xinference/ui/web/ui/build/static/js/{main.b969199a.js.map → main.4918643a.js.map} +1 -1
  46. xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
  47. xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
  48. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +1 -0
  49. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
  50. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +1 -0
  51. xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
  52. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/METADATA +6 -1
  53. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/RECORD +58 -50
  54. xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
  55. xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
  56. xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
  57. xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
  58. xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
  59. xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
  60. /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.4918643a.js.LICENSE.txt} +0 -0
  61. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/WHEEL +0 -0
  62. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/entry_points.txt +0 -0
  63. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/licenses/LICENSE +0 -0
  64. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/top_level.txt +0 -0
@@ -286,12 +286,18 @@ class PytorchModel(LLM):
 
         kwargs = {}
 
-        dtype = get_device_preferred_dtype(self._device)
-
-        if dtype is not None:
-            kwargs["torch_dtype"] = dtype
+        torch_dtype = self._pytorch_model_config.get("torch_dtype")
+        if torch_dtype is not None:
+            if isinstance(torch_dtype, str) and torch_dtype != "auto":
+                torch_dtype = getattr(torch, torch_dtype)
+            kwargs["torch_dtype"] = torch_dtype
         else:
-            raise ValueError(f"Device {self._device} is not supported in temporary")
+            dtype = get_device_preferred_dtype(self._device)
+
+            if dtype is not None:
+                kwargs["torch_dtype"] = dtype
+            else:
+                raise ValueError(f"Device {self._device} is not supported in temporary")
 
         kwargs["revision"] = self._pytorch_model_config.get(
             "revision", self.model_spec.model_revision
@@ -327,6 +333,8 @@ class PytorchModel(LLM):
             reasoning_content, enable_thinking=enable_thinking
         )
 
+        logger.debug("Loading Transformers model with kwargs: %s", kwargs)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
@@ -488,7 +496,7 @@ class PytorchModel(LLM):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:
@@ -878,7 +886,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:
@@ -28,7 +28,7 @@ class Gemma3TextChatModel(PytorchChatModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-1b-it".lower() in llm_family.lower():
@@ -0,0 +1,91 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import logging
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    PytorchGenerateConfig,
+    PytorchModelConfig,
+)
+from ..harmony import async_stream_harmony_chat_completion
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from .core import PytorchChatModel, register_non_default_model
+
+logger = logging.getLogger(__name__)
+
+
+@register_transformer
+@register_non_default_model("gpt-oss")
+class GPTOSSPytorchChatModel(PytorchChatModel):
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        config = super()._sanitize_model_config(pytorch_model_config)
+        config.setdefault("torch_dtype", "auto")
+        return config  # type:ignore
+
+    @classmethod
+    def match_json(
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if "gpt" not in model_family and "oss" not in model_family:
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    async def chat(  # type:ignore
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        gen = super().chat(messages, generate_config=generate_config)
+
+        if inspect.iscoroutine(gen):
+            gen = await gen
+
+        if inspect.isasyncgen(gen):
+            # Streaming
+            async def stream_parser():
+                full_text = ""
+                full_reasoning = ""
+
+                async for parsed_chunk in async_stream_harmony_chat_completion(gen):
+                    choices = parsed_chunk.get("choices")
+                    if choices and len(choices) > 0:
+                        delta = choices[0].get("delta", {})
+                        if delta.get("content"):
+                            full_text += delta["content"]
+                        if delta.get("reasoning_content"):
+                            full_reasoning += delta["reasoning_content"]
+                    yield parsed_chunk
+
+                logger.debug(
+                    "Chat finished, content: %r, reasoning: %r",
+                    full_text,
+                    full_reasoning,
+                )
+
+            return stream_parser()
+
+        else:
+            # Non-streaming sync - handle single result
+            async for parsed_completion in async_stream_harmony_chat_completion(gen):  # type: ignore
+                return parsed_completion
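GPTOSSPytorchChatModel.chat above returns either an async generator of Harmony-parsed chunks (streaming) or a single parsed completion. A hedged consumer sketch, assuming the chunk and completion dict shapes implied by the class itself; this is not a documented client API:

    async def consume(model, messages):
        # `model` is assumed to be a loaded GPTOSSPytorchChatModel instance.
        result = await model.chat(messages, generate_config={"stream": True})
        if hasattr(result, "__aiter__"):
            # Streaming: chunks carry incremental "content" / "reasoning_content" deltas.
            async for chunk in result:
                choices = chunk.get("choices") or []
                if choices:
                    delta = choices[0].get("delta", {})
                    print(delta.get("reasoning_content") or delta.get("content") or "", end="")
        else:
            # Non-streaming: a single ChatCompletion-style dict is assumed.
            print(result["choices"][0]["message"]["content"])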
@@ -21,9 +21,9 @@ from .....types import (
     CompletionChunk,
     PytorchGenerateConfig,
 )
+from ....utils import cache_clean
 from ...utils import generate_chat_completion, generate_completion_chunk
 from ..core import PytorchChatModel
-from ..utils import cache_clean
 
 
 class PytorchMultiModalModel(PytorchChatModel):
@@ -31,7 +31,7 @@ class Gemma3ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-it".lower() in llm_family.lower():
@@ -28,14 +28,14 @@ logger = logging.getLogger(__name__)
 
 
 @register_transformer
-@register_non_default_model("glm-4.1v-thinking")
+@register_non_default_model("glm-4.1v-thinking", "glm-4.5v")
 class Glm4_1VModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
-        if "glm-4.1v" in family.lower():
+        if "glm-4.1v" in family.lower() or "glm-4.5v" in family.lower():
             return True
         return False
 
@@ -37,7 +37,7 @@ class Ovis2ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "ovis2".lower() in llm_family.lower():
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
-import importlib.util
 import io
 import logging
 import time
@@ -20,13 +19,13 @@ import uuid
 from threading import Thread
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
-from .....model.utils import select_device
 from .....types import (
     ChatCompletion,
     ChatCompletionAudio,
     ChatCompletionChoice,
     CompletionUsage,
 )
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import PytorchGenerateConfig, register_non_default_model
 from .core import PytorchMultiModalModel
@@ -46,7 +45,7 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2.5-omni".lower() in llm_family.lower():
@@ -71,12 +70,12 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
 
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if self._device == "cuda" else self._device
-        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
-        kwargs = (
-            {}
-            if not flash_attn_installed
-            else {"attn_implementation": "flash_attention_2"}
+        kwargs = {}
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
         )
+        if enable_flash_attn:
+            kwargs["attn_implementation"] = "flash_attention_2"
         kwargs = self.apply_bnb_quantization(kwargs)
         logger.debug("Loading model with extra kwargs: %s", kwargs)
 
@@ -11,15 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import importlib.util
 import logging
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 from .....core.model import register_batching_multimodal_models
 from .....device_utils import is_npu_available
-from .....model.utils import select_device
 from .....types import PytorchModelConfig
 from ....scheduler.request import InferenceRequest
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
@@ -48,7 +47,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -87,7 +86,6 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             Qwen2_5_VLForConditionalGeneration = None
 
         kwargs = self.apply_bnb_quantization()
-        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
         llm_family = self.model_family.model_family or self.model_family.model_name
         model_cls = (
             Qwen2_5_VLForConditionalGeneration
@@ -97,12 +95,17 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
-        if flash_attn_installed:
+
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
+        )
+
+        if enable_flash_attn:
             self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
-                device_map=device,
                 attn_implementation="flash_attention_2",
+                device_map=device,
                 trust_remote_code=True,
                 **kwargs,
             ).eval()
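Both flash-attention hunks (qwen-omni.py and qwen2_vl.py) now read an enable_flash_attn flag from the pytorch model config and fall back to auto-detection. A minimal sketch of that decision; is_flash_attn_available is assumed here to simply probe for the flash_attn package, matching the importlib check it replaces:

    import importlib.util

    def is_flash_attn_available() -> bool:
        # Assumed behaviour: flash attention is usable when flash_attn is importable.
        return importlib.util.find_spec("flash_attn") is not None

    def build_attn_kwargs(pytorch_model_config: dict) -> dict:
        # An explicit enable_flash_attn setting wins; otherwise fall back to detection.
        enable = pytorch_model_config.get("enable_flash_attn", is_flash_attn_available())
        return {"attn_implementation": "flash_attention_2"} if enable else {}

    # e.g. force flash attention off even when the package is installed:
    assert build_attn_kwargs({"enable_flash_attn": False}) == {}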
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
-import functools
+
 import logging
 import os
 import time
@@ -495,34 +494,3 @@ def batch_inference_one_step(
         for r in req_list:
             r.stopped = True
             r.error_msg = str(e)
-
-
-def cache_clean(fn):
-    @functools.wraps(fn)
-    async def _async_wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = await fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    @functools.wraps(fn)
-    def _wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    if asyncio.iscoroutinefunction(fn):
-        return _async_wrapper
-    else:
-        return _wrapper
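cache_clean is removed here but not dropped: the new import in transformers/multimodal/core.py (from ....utils import cache_clean) and the +109 lines in xinference/model/utils.py in the file list indicate it moved up a package level. A condensed sketch of the same decorator pattern, with the xinference-specific empty_cache call replaced by a comment:

    import asyncio
    import functools
    import gc

    def cache_clean(fn):
        # Wrap an inference method so memory is reclaimed after every call;
        # the async or sync wrapper is chosen based on the wrapped function.
        @functools.wraps(fn)
        async def _async_wrapper(self, *args, **kwargs):
            result = await fn(self, *args, **kwargs)
            gc.collect()  # xinference additionally calls device_utils.empty_cache()
            return result

        @functools.wraps(fn)
        def _wrapper(self, *args, **kwargs):
            result = fn(self, *args, **kwargs)
            gc.collect()  # xinference additionally calls device_utils.empty_cache()
            return result

        return _async_wrapper if asyncio.iscoroutinefunction(fn) else _wrapper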
@@ -67,6 +67,9 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen3",
     "HuatuoGPT-o1-Qwen2.5",
     "DianJin-R1",
+    "Qwen3-Thinking",
+    "Qwen3-Instruct",
+    "Qwen3-Coder",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -79,9 +82,7 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
-DEEPSEEK_TOOL_CALL_FAMILY = [
-    "deepseek-v3",
-]
+DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528"]
 
 TOOL_CALL_FAMILY = (
     QWEN_TOOL_CALL_FAMILY
@@ -167,8 +168,7 @@ class ChatModelMixin:
                 return json.loads(kwargs)
             except json.JSONDecodeError:
                 raise TypeError(
-                    f"`chat_template_kwargs` should be json parsable, "
-                    f"got: {kwargs}"
+                    f"`chat_template_kwargs` should be json parsable, got: {kwargs}"
                 )
         elif isinstance(kwargs, dict):
             return kwargs
@@ -254,7 +254,7 @@ class ChatModelMixin:
                 ret += role + "\n" + text + intra_message_sep + "\n"
             else:
                 placeholders = "\n".join(
-                    f"Image-{i+1}: <image>\n"
+                    f"Image-{i + 1}: <image>\n"
                     for i in range(
                         len(images) - len(image_futures), len(images)
                     )
@@ -463,6 +463,7 @@ class ChatModelMixin:
             chat_context_var.set(ctx)
 
         previous_texts = [""]
+        full_text = ""
         # Process chunks
         if reasoning_parser:
             set_context()
@@ -474,10 +475,14 @@
                 # usage
                 chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
+                if choices[0].get("text"):
+                    full_text += choices[0]["text"]  # type: ignore
+
                 chat_chunk = cls._to_chat_completion_chunk(
                     chunk, reasoning_parser, previous_texts
                 )
             yield chat_chunk
+        logger.debug("Chat finished, output: %s", full_text)
 
     @staticmethod
     def _to_chat_completion(
@@ -683,6 +688,52 @@
 
         return results
 
+    @classmethod
+    def _eval_deepseek_r1_arguments(cls, c) -> List[Tuple]:
+        """
+        Parses tool calls from deepseek-r1 (0528) chat template format.
+        Returns:
+            List of (None, function_name, arguments_dict)
+            or (raw_content, None, None) if parsing fails.
+        """
+        text = c["choices"][0]["text"]
+        pattern = (
+            r"<\|tool▁call▁begin|>function<\|tool▁sep|>([^\n]+)\n"
+            r"```json\n(.*?)\n```<\|tool▁call▁end|>"
+        )
+
+        matches = re.findall(pattern, text, re.DOTALL)
+        if not matches:
+            return [(text, None, None)]
+
+        tool_calls = set()
+        results = []
+
+        for func_name, raw_json in matches:
+            func_and_args = None
+            try:
+                func_and_args = json.loads(raw_json)
+                arguments_hashable = frozenset(func_and_args.items())
+                tool_call_tuple = (
+                    None,
+                    func_name,
+                    func_and_args,
+                )
+            except Exception:
+                tool_call_tuple = (raw_json, None, None)
+                arguments_hashable = None
+
+            dedup_key = (
+                (func_name, arguments_hashable)
+                if func_and_args is not None
+                else raw_json
+            )
+            if dedup_key not in tool_calls:
+                tool_calls.add(dedup_key)
+                results.append(tool_call_tuple)
+
+        return results
+
     @classmethod
     def _eval_tool_arguments(
         cls, model_family, c, tool_call_text: Optional[str] = None
@@ -695,7 +746,10 @@
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
         elif family in DEEPSEEK_TOOL_CALL_FAMILY:
-            result = cls._eval_deepseek_chat_arguments(c)
+            if family == "deepseek-r1-0528":
+                result = cls._eval_deepseek_r1_arguments(c)
+            else:
+                result = cls._eval_deepseek_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
@@ -89,6 +89,7 @@ class VLLMModelConfig(TypedDict, total=False):
     mm_processor_kwargs: NotRequired[dict[str, Any]]
     min_pixels: NotRequired[int]
     max_pixels: NotRequired[int]
+    enable_expert_parallel: bool
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
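enable_expert_parallel is a new VLLMModelConfig key; vLLM exposes an engine argument of the same name for MoE expert parallelism. A hedged usage sketch, assuming extra launch kwargs are forwarded into the vLLM model config the same way as the other keys in this TypedDict:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    # The extra keyword argument is assumed to land in VLLMModelConfig and reach
    # vLLM's engine arguments; the model name here is only an example.
    model_uid = client.launch_model(
        model_name="glm-4.5",
        model_engine="vLLM",
        enable_expert_parallel=True,
    )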
@@ -273,8 +274,12 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
 
-if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
+
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
 
 
 class VLLMModel(LLM):
@@ -557,7 +562,9 @@ class VLLMModel(LLM):
             raise err.with_traceback(tb)
 
         # set context length after engine inited
-        self._set_context_length()
+        # if shard > 0, the engine will be inited in another process
+        if self._engine:
+            self._set_context_length()
 
     def _set_context_length(self):
         from vllm import envs
@@ -839,7 +846,7 @@ class VLLMModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -1187,7 +1194,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
+        if llm_spec.model_format not in [
+            "pytorch",
+            "gptq",
+            "awq",
+            "fp8",
+            "bnb",
+            "ggufv2",
+        ]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -1284,6 +1298,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         previous_texts = [""]
         tool_call = False
         tool_call_texts = [""]
+        full_text = ""
         if self.reasoning_parser:
             set_context()
             chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
@@ -1299,6 +1314,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
+                full_text += chunk["choices"][0]["text"]
                 if self.is_tool_call_chunk_start(chunk):
                     tool_call = True
                 if tool_call:
@@ -1320,6 +1336,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                     chunk, self.reasoning_parser, previous_texts
                 )
                 i += 1
+        logger.debug("Chat finished, output: %s", full_text)
 
     @vllm_check
     async def async_chat(
@@ -1348,13 +1365,26 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         ):
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
 
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
 
+        lora_request = None
+        lora_model = generate_config.get("lora_name")
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+        tokenizer = await self._get_tokenizer(lora_request)
+
+        full_prompt = self.get_full_context(
+            messages,
+            self.model_family.chat_template,
+            tokenizer=tokenizer,
+            **full_context_kwargs,
+        )
+
         if stream:
             agen = await self.async_generate(
                 full_prompt, generate_config, tools, request_id=request_id
@@ -1386,7 +1416,7 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):