xinference 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.
Files changed (68)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +9 -1
  3. xinference/core/model.py +19 -0
  4. xinference/core/resource.py +7 -1
  5. xinference/core/status_guard.py +1 -0
  6. xinference/core/supervisor.py +228 -19
  7. xinference/core/utils.py +1 -29
  8. xinference/core/worker.py +28 -2
  9. xinference/deploy/cmdline.py +33 -3
  10. xinference/deploy/test/test_cmdline.py +32 -0
  11. xinference/device_utils.py +43 -1
  12. xinference/model/audio/kokoro.py +19 -36
  13. xinference/model/audio/model_spec.json +1 -1
  14. xinference/model/image/stable_diffusion/core.py +15 -6
  15. xinference/model/llm/llm_family.json +521 -6
  16. xinference/model/llm/llm_family.py +3 -1
  17. xinference/model/llm/llm_family_modelscope.json +559 -6
  18. xinference/model/llm/reasoning_parsers/__init__.py +13 -0
  19. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
  20. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
  21. xinference/model/llm/sglang/core.py +99 -11
  22. xinference/model/llm/transformers/intern_vl.py +23 -14
  23. xinference/model/llm/utils.py +53 -19
  24. xinference/model/llm/vllm/core.py +23 -2
  25. xinference/model/llm/vllm/xavier/executor.py +2 -2
  26. xinference/model/llm/vllm/xavier/scheduler.py +3 -3
  27. xinference/thirdparty/internvl/conversation.py +26 -17
  28. xinference/types.py +2 -0
  29. xinference/web/ui/build/asset-manifest.json +6 -6
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/css/main.f8177338.css +2 -0
  32. xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
  33. xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
  34. xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
  47. xinference/web/ui/src/locales/en.json +14 -1
  48. xinference/web/ui/src/locales/zh.json +14 -1
  49. {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/METADATA +11 -11
  50. {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/RECORD +55 -49
  51. xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
  52. xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
  53. xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
  54. xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
  64. /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
  65. {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/LICENSE +0 -0
  66. {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/WHEEL +0 -0
  67. {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/entry_points.txt +0 -0
  68. {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py (new file)
@@ -0,0 +1,98 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Type, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice, CompletionChunk
+
+
+class ReasoningParser(ABC):
+    """Abstract base class for reasoning content parsers."""
+
+    def __init__(
+        self,
+        reasoning_start_tag: str = "<think>",
+        reasoning_end_tag: str = "</think>",
+    ):
+        """Initialize the reasoning parser.
+
+        Args:
+            reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
+            reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
+        """
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
+
+    @abstractmethod
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: Union[str, CompletionChunk],
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from model output in a streaming fashion.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        pass
+
+    @abstractmethod
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        pass
+
+
+class ReasoningParserManager:
+    """Manager class for reasoning parsers."""
+
+    _parsers: Dict[str, Type[ReasoningParser]] = {}
+
+    @classmethod
+    def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
+        """Register a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+            parser_cls (Type[ReasoningParser]): The parser class to register.
+        """
+        cls._parsers[model_name] = parser_cls
+
+    @classmethod
+    def register_module(cls, model_name: str):
+        """Decorator for registering a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Callable: The decorator function.
+        """
+
+        def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
+            cls.register(model_name, parser_cls)
+            return parser_cls
+
+        return _register
+
+    @classmethod
+    def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
+        """Get the registered parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
+        """
+        return cls._parsers.get(model_name)
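The registry above is the extension point the new reasoning support hangs off. A minimal sketch (not part of the package) of how a parser is registered and looked up, assuming the wheel is installed; the "my-model" name and MyParser class are hypothetical:

    from typing import Optional, Tuple, Union

    from xinference.model.llm.reasoning_parsers.abs_reasoning_parsers import (
        ReasoningParser,
        ReasoningParserManager,
    )


    @ReasoningParserManager.register_module("my-model")
    class MyParser(ReasoningParser):
        """Toy parser: everything before the end tag counts as reasoning."""

        def extract_reasoning_content_streaming(self, previous_text, current_text, delta):
            # Pass streaming deltas through unchanged in this toy example.
            return delta

        def extract_reasoning_content(
            self, model_output: Union[str, dict]
        ) -> Tuple[Optional[str], Optional[str]]:
            text = model_output if isinstance(model_output, str) else model_output["text"]
            head, sep, tail = text.partition(self.reasoning_end_tag)
            return (head, tail) if sep else (None, text)


    parser_cls = ReasoningParserManager.get_parser("my-model")  # -> MyParser
    parser = parser_cls()  # default "<think>" / "</think>" tags
    print(parser.extract_reasoning_content("weigh the options</think>pick B"))
    # ('weigh the options', 'pick B')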
xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py (new file)
@@ -0,0 +1,140 @@
+import re
+from typing import Optional, Tuple, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module("deepseek-v3")
+@ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
+@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-R1 model."""
+
+    def __init__(
+        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+    ):
+        super().__init__(reasoning_start_tag, reasoning_end_tag)
+        self.reasoning_regex = re.compile(
+            rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
+        )
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> Optional[ChatCompletionChunkDelta]:
+        """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
+
+        Args:
+            previous_text (str): The previous accumulated text content.
+            current_text (Union[str, ChatCompletionChunk]): The current text chunk or completion chunk.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        if delta is None:
+            return delta
+
+        delta_text = delta["content"]
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.reasoning_start_tag in previous_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # <think> in previous, </think> in previous,
+                # <think> in previous, </think> in previous,
+                # reasoning content ends
+                return delta
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        elif self.reasoning_start_tag in delta_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_idx = delta_text.find(self.reasoning_start_tag)
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[
+                    start_idx + len(self.reasoning_start_tag) : end_idx
+                ]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.reasoning_end_tag in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # </think> in previous, thinking content ends
+                return delta
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from DeepSeek-R1 model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        if not isinstance(model_output, str):
+            model_output = model_output["text"]
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.reasoning_end_tag not in model_output:
+            return model_output, None
+        else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.reasoning_start_tag not in model_output:
+                model_output = f"{self.reasoning_start_tag}{model_output}"
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            end_index = len(
+                f"{self.reasoning_start_tag}{reasoning_content}{self.reasoning_end_tag}"
+            )
+            final_output = model_output[end_index:]
+
+            if len(final_output) == 0:
+                return reasoning_content, None
+            return reasoning_content, final_output
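A short illustration of the non-streaming path above, assuming the wheel is installed; the sample strings are made up:

    from xinference.model.llm.reasoning_parsers.deepseek_r1_reasoning_parser import (
        DeepSeekR1ReasoningParser,
    )

    parser = DeepSeekR1ReasoningParser()

    # R1 may omit the opening <think> tag (see the referenced HF commit),
    # so the parser prepends it before matching.
    print(parser.extract_reasoning_content("First compare the options.</think>Use B."))
    # ('First compare the options.', 'Use B.')

    # No closing tag at all: the whole output is returned as reasoning content.
    print(parser.extract_reasoning_content("still thinking..."))
    # ('still thinking...', None)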
xinference/model/llm/sglang/core.py
@@ -14,10 +14,14 @@
 
 import json
 import logging
+import sys
+import threading
 import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
 
+from xoscar.utils import get_next_port
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -40,6 +44,10 @@ class SGLANGModelConfig(TypedDict, total=False):
     mem_fraction_static: float
     log_level: str
     attention_reduce_in_fp32: bool  # For gemma
+    # distributed
+    nnodes: Optional[int]
+    node_rank: Optional[int]
+    dist_init_addr: Optional[str]
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
@@ -91,6 +99,10 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+    "deepseek-v3",
+    "deepseek-r1",
 ]
 
 
@@ -107,6 +119,16 @@ class SGLANGModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self._address = model_config.pop("address", None)  # type: ignore
+        self._n_worker = model_config.pop("n_worker", 1)  # type: ignore
+        self._shard = model_config.pop("shard", 0)  # type: ignore
+        self._driver_info = model_config.pop("driver_info", None)  # type: ignore
+        self._loading_thread = None
+        self._loading_error = None
+
+    @property
+    def driver_info(self) -> Optional[dict]:
+        return self._driver_info
 
     def load(self):
         try:
@@ -128,18 +150,84 @@ class SGLANGModel(LLM):
         else:
             self._model_config.setdefault("attention_reduce_in_fp32", False)
 
-        logger.info(
-            f"Loading {self.model_uid} with following model config: {self._model_config}"
-        )
+        # gen port for sgl Runtime,
+        # this is useful for sglang service on a same machine.
+        # sglang typically find a port between [port, 40000]
+        # we need to ensure the generated port < 40000
+        sgl_port = None
+        for _ in range(10):
+            sgl_port = get_next_port()
+            if sgl_port >= 40000:
+                sgl_port = None
+            else:
+                break
+        if sgl_port is None:
+            raise ValueError("Failed to find a port for sglang")
+
+        if self._n_worker > 1:
+            # distributed inference
+            self._model_config["nnodes"] = self._n_worker
+            self._model_config["node_rank"] = self._shard
+            # model across multiple workers
+            if self._shard == 0:
+                # distributed, need to init driver_info
+                assert self._driver_info is None
+                # This must run inside Xoscar pool
+                dist_init_addr = f"{self._address.split(':', 1)[0]}:{get_next_port()}"
+                self._driver_info = {"dist_init_addr": dist_init_addr}
+                self._model_config["dist_init_addr"] = dist_init_addr
+            else:
+                assert self._driver_info is not None
+                self._model_config["dist_init_addr"] = self._driver_info[
+                    "dist_init_addr"
+                ]
 
-        self._engine = sgl.Runtime(
-            model_path=self.model_path,
-            tokenizer_path=self.model_path,
-            **self._model_config,
-        )
+            logger.info(
+                f"Loading {self.model_uid}, shard({self._shard} of {self._n_worker}) with following model config: {self._model_config}"
+            )
+
+            def _load():
+                try:
+                    self._engine = sgl.Runtime(
+                        model_path=self.model_path,
+                        tokenizer_path=self.model_path,
+                        port=sgl_port,
+                        **self._model_config,
+                    )
+                except:
+                    logger.exception("Creating sglang Runtime failed")
+                    self._loading_error = sys.exc_info()
+
+            self._loading_thread = threading.Thread(target=_load)
+            self._loading_thread.start()
+            if self._shard == 0:
+                # wait for 3 seconds to ensure torch distributed inited first
+                self._loading_thread.join(3)
+        else:
+            logger.info(
+                f"Loading {self.model_uid} with following model config: {self._model_config}"
+            )
+
+            self._engine = sgl.Runtime(
+                model_path=self.model_path,
+                tokenizer_path=self.model_path,
+                port=sgl_port,
+                **self._model_config,
+            )
+
+    def wait_for_load(self):
+        if self._loading_thread:
+            if self._shard == 0:
+                # for the shard 0, we wait it to complete
+                # the sglang will serve forever for the other shards,
+                # so we only check if any error happens.
+                self._loading_thread.join()
+            if self._loading_error:
+                _, err, tb = self._loading_error
+                raise err.with_traceback(tb)
 
     def stop(self):
-        logger.info("Stopping SGLang engine")
+        logger.info("Stopping SGLang engine, sglang pid: %s", self._engine.pid)
         self._engine.shutdown()
 
     def _sanitize_model_config(
@@ -151,7 +239,7 @@ class SGLANGModel(LLM):
         cuda_count = self._get_cuda_count()
         model_config.setdefault("tokenizer_mode", "auto")
        model_config.setdefault("trust_remote_code", True)
-        model_config.setdefault("tp_size", cuda_count)
+        model_config.setdefault("tp_size", cuda_count * self._n_worker)
         # See https://github.com/sgl-project/sglang/blob/00023d622a6d484e67ef4a0e444f708b8fc861c8/python/sglang/srt/server_args.py#L100-L109
         mem_fraction_static = model_config.get("mem_fraction_static")
         if mem_fraction_static is None:
@@ -159,7 +247,7 @@ class SGLANGModel(LLM):
             if tp_size >= 16:
                 model_config["mem_fraction_static"] = 0.79
             elif tp_size >= 8:
-                model_config["mem_fraction_static"] = 0.83
+                model_config["mem_fraction_static"] = 0.81
             elif tp_size >= 4:
                 model_config["mem_fraction_static"] = 0.85
             elif tp_size >= 2:
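The new defaulting logic in _sanitize_model_config is easier to read outside the diff. A standalone paraphrase (hypothetical helper, same thresholds as in the hunks above):

    def default_sglang_config(cuda_count: int, n_worker: int, config: dict) -> dict:
        """Paraphrase of the tp_size / mem_fraction_static defaults shown above."""
        config.setdefault("tp_size", cuda_count * n_worker)  # now spans every worker, not one node
        if config.get("mem_fraction_static") is None:
            tp_size = config["tp_size"]
            if tp_size >= 16:
                config["mem_fraction_static"] = 0.79
            elif tp_size >= 8:
                config["mem_fraction_static"] = 0.81  # lowered from 0.83 in this release
            elif tp_size >= 4:
                config["mem_fraction_static"] = 0.85
            # values for smaller tp_size keep the defaults outside the shown hunk
        return config


    print(default_sglang_config(cuda_count=4, n_worker=2, config={}))
    # {'tp_size': 8, 'mem_fraction_static': 0.81}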
xinference/model/llm/transformers/intern_vl.py
@@ -265,15 +265,24 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
+        model_name = f"{self.model_family.model_name.lower()}-{model_size}"
         num_layers = {
-            "1B": 24,
-            "2B": 24,
-            "4B": 32,
-            "8B": 32,
-            "26B": 48,
-            "40B": 60,
-            "76B": 80,
-        }[model_size]
+            "internvl2-1B": 24,
+            "internvl2-2B": 24,
+            "internvl2-4B": 32,
+            "internvl2-8B": 32,
+            "internvl2-26B": 48,
+            "internvl2-40B": 60,
+            "internvl2-76B": 80,
+            "internvl2.5-1B": 24,
+            "internvl2.5-2B": 24,
+            "internvl2.5-4B": 36,
+            "internvl2.5-8B": 32,
+            "internvl2.5-26B": 48,
+            "internvl2.5-38B": 64,
+            "internvl2.5-78B": 80,
+        }[model_name]
+
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
         num_layers_per_gpu = [num_layers_per_gpu] * world_size
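The device-map arithmetic above is easier to see with numbers. A worked example for the new "internvl2.5-38B" entry; note that the later trimming of GPU 0's share happens after the lines shown in this hunk:

    import math

    num_layers = 64   # "internvl2.5-38B" from the table above
    world_size = 4

    # GPU 0 also hosts the vision tower, so it is counted as half a GPU when sizing shares.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))  # ceil(64 / 3.5) = 19
    print([num_layers_per_gpu] * world_size)  # [19, 19, 19, 19], before GPU 0's share is reduced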
@@ -322,9 +331,7 @@ class InternVLChatModel(PytorchChatModel):
             self._model.cuda()
 
         self._tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True,
-            use_fast=False,
+            self.model_path, trust_remote_code=True, use_fast=False
         )
 
     @cache_clean
@@ -339,11 +346,12 @@ class InternVLChatModel(PytorchChatModel):
         IMG_END_TOKEN = "</img>"
         IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
 
+        generate_config = generate_config if isinstance(generate_config, dict) else {}
+
         generation_config = {
-            "max_new_tokens": generate_config.get("max_tokens", 1024)
-            if generate_config
-            else 1024,
+            "max_new_tokens": (generate_config.get("max_tokens", 1024)),
             "do_sample": False,
+            "temperature": generate_config.get("temperature", None),
         }
 
         stream = (
@@ -458,6 +466,7 @@ class InternVLChatModel(PytorchChatModel):
         streamer = TextIteratorStreamer(
             self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
         )
+
         # Define the generation configuration
         generate_kwargs["streamer"] = streamer
         # Start the model chat in a separate thread
xinference/model/llm/utils.py
@@ -54,6 +54,7 @@ from .llm_family import (
     _get_cache_dir,
     get_cache_status,
 )
+from .reasoning_parsers.abs_reasoning_parsers import ReasoningParser
 
 logger = logging.getLogger(__name__)
 
@@ -321,6 +322,7 @@ class ChatModelMixin:
     def _to_chat_completion_chunks(
         cls,
         chunks: Iterator[CompletionChunk],
+        reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
         for i, chunk in enumerate(chunks):
             if i == 0:
@@ -365,37 +367,69 @@ class ChatModelMixin:
     async def _async_to_chat_completion_chunks(
         cls,
         chunks: AsyncGenerator[CompletionChunk, None],
+        reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
+        previous_text = ""
+        current_text = ""
         async for chunk in chunks:
             if i == 0:
-                yield cls._get_first_chat_completion_chunk(chunk)
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield cls._get_final_chat_completion_chunk(chunk)
+                chunk = cls._get_first_chat_completion_chunk(chunk)
+            elif not chunk.get("choices"):
+                # usage
+                chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
-                yield cls._to_chat_completion_chunk(chunk)
+                chunk = cls._to_chat_completion_chunk(chunk)
+            if reasoning_parser is not None:
+                choices = chunk.get("choices")
+                for choice in choices:
+                    delta = choice.get("delta")
+                    if not delta:
+                        continue
+                    current_text = previous_text + delta.get("content")
+                    choice[
+                        "delta"
+                    ] = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_text,
+                        current_text=current_text,
+                        delta=delta,
+                    )
+                    previous_text = current_text
+            yield chunk
             i += 1
 
     @staticmethod
-    def _to_chat_completion(completion: Completion) -> ChatCompletion:
-        return {
-            "id": "chat" + completion["id"],
-            "object": "chat.completion",
-            "created": completion["created"],
-            "model": completion["model"],
-            "choices": [
+    def _to_chat_completion(
+        completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
+    ) -> ChatCompletion:
+        choices = []
+        for i, choice in enumerate(completion["choices"]):
+            content = choice["text"]
+            reasoning_content = None
+
+            if reasoning_parser is not None:
+                reasoning_content, content = reasoning_parser.extract_reasoning_content(
+                    choice
+                )
+
+            message = {"role": "assistant", "content": content}
+
+            # add only reasoning_content is None
+            if reasoning_content is not None:
+                message["reasoning_content"] = reasoning_content
+
+            choices.append(
                 {
                     "index": i,
-                    "message": {
-                        "role": "assistant",
-                        "content": choice["text"],
-                    },
+                    "message": message,
                     "finish_reason": choice["finish_reason"],
                 }
-                for i, choice in enumerate(completion["choices"])
-            ],
+            )
+        return {
+            "id": "chat" + completion["id"],
+            "object": "chat.completion",
+            "created": completion["created"],
+            "model": choices,
             "usage": completion["usage"],
         }
 
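The new streaming path hands each delta to the parser together with the accumulated text. A small driver (fabricated deltas, not captured traffic) mirroring that loop with the DeepSeek parser from earlier in this diff:

    from xinference.model.llm.reasoning_parsers.deepseek_r1_reasoning_parser import (
        DeepSeekR1ReasoningParser,
    )

    parser = DeepSeekR1ReasoningParser()
    previous_text = ""
    for piece in ["<think>check both", " branches</think>", "Answer: B"]:
        delta = {"role": "assistant", "content": piece}
        current_text = previous_text + piece
        delta = parser.extract_reasoning_content_streaming(previous_text, current_text, delta)
        previous_text = current_text
        print(repr(delta.get("reasoning_content")), "|", repr(delta["content"]))
    # '<think>check both' | ''   (the opening tag is not stripped on this branch)
    # ' branches' | ''
    # None | 'Answer: B'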
xinference/model/llm/vllm/core.py
@@ -43,6 +43,8 @@ from ....types import (
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
+from ..reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
+from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -72,6 +74,7 @@ class VLLMModelConfig(TypedDict, total=False):
     limit_mm_per_prompt: Optional[Dict[str, int]]
     guided_decoding_backend: Optional[str]
     scheduling_policy: Optional[str]
+    reasoning_content: bool
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -176,6 +179,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.1":
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v3")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
@@ -190,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -206,6 +212,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen-2.5-instruct-1m")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -234,6 +243,7 @@ class VLLMModel(LLM):
         self.lora_modules = peft_model
         self.lora_requests: List[LoRARequest] = []
         self._xavier_config = None
+        self.reasoning_parser = None
 
     def set_xavier_config(self, value: Optional[Dict]):
         self._xavier_config = value  # type: ignore
@@ -262,6 +272,16 @@ class VLLMModel(LLM):
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
+        reasoning_content = self._model_config.pop("reasoning_content")
+
+        # Initialize reasoning parser if model has reasoning ability
+        if "reasoning" in self.model_family.model_ability and reasoning_content:
+            module_name = self.model_family.model_family or self.model_family.model_name
+            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
+            self.reasoning_parser = self.reasoning_parser(
+                self.model_family.reasoning_start_tag,
+                self.model_family.reasoning_end_tag,
+            )
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -368,6 +388,7 @@ class VLLMModel(LLM):
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
         model_config.setdefault("guided_decoding_backend", "outlines")
+        model_config.setdefault("reasoning_content", False)
         # Add scheduling policy if vLLM version is 0.6.3 or higher
         if vllm.__version__ >= "0.6.3":
             model_config.setdefault("scheduling_policy", "fcfs")
@@ -835,7 +856,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert isinstance(agen, AsyncGenerator)
             if tools:
                 return self._async_to_tool_completion_chunks(agen)
-            return self._async_to_chat_completion_chunks(agen)
+            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
             c = await self.async_generate(
                 full_prompt, generate_config, request_id=request_id
@@ -843,7 +864,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert not isinstance(c, AsyncGenerator)
             if tools:
                 return self._tool_calls_completion(self.model_family, self.model_uid, c)
-            return self._to_chat_completion(c)
+            return self._to_chat_completion(c, self.reasoning_parser)
 
 
 class VLLMVisionModel(VLLMModel, ChatModelMixin):
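End to end, the new reasoning_content switch is meant to be set at launch time. A hedged sketch, assuming a running Xinference endpoint at localhost:9997 and that extra launch kwargs are forwarded into VLLMModelConfig (which the setdefault above implies); the model name and size are illustrative:

    from xinference.client import Client

    client = Client("http://localhost:9997")  # assumes a locally running xinference server

    model_uid = client.launch_model(
        model_name="deepseek-r1-distill-qwen",
        model_engine="vllm",
        model_size_in_billions=7,
        reasoning_content=True,  # intended to reach the vLLM model config shown above
    )

    model = client.get_model(model_uid)
    resp = model.chat(messages=[{"role": "user", "content": "Which is larger, 9.11 or 9.8?"}])
    msg = resp["choices"][0]["message"]
    print(msg.get("reasoning_content"))  # the <think> part split out by the parser
    print(msg["content"])                # the final answer only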