xinference 1.3.0.post2__py3-none-any.whl → 1.3.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +1 -0
- xinference/conftest.py +7 -0
- xinference/core/chat_interface.py +39 -24
- xinference/core/model.py +3 -1
- xinference/core/scheduler.py +3 -0
- xinference/core/worker.py +1 -1
- xinference/model/embedding/core.py +12 -5
- xinference/model/llm/__init__.py +2 -1
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +266 -3
- xinference/model/llm/llm_family.json +390 -17
- xinference/model/llm/llm_family_modelscope.json +348 -29
- xinference/model/llm/mlx/core.py +15 -4
- xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +9 -13
- xinference/model/llm/sglang/core.py +7 -2
- xinference/model/llm/transformers/chatglm.py +4 -4
- xinference/model/llm/transformers/core.py +22 -5
- xinference/model/llm/transformers/intern_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +1 -1
- xinference/model/llm/utils.py +134 -60
- xinference/model/llm/vllm/core.py +31 -42
- xinference/types.py +4 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
- xinference/web/ui/src/locales/en.json +9 -1
- xinference/web/ui/src/locales/zh.json +9 -1
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/METADATA +9 -5
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/RECORD +43 -44
- xinference/model/llm/reasoning_parsers/__init__.py +0 -13
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
- xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
- xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
- /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/LICENSE +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/WHEEL +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py}
RENAMED
```diff
@@ -1,20 +1,17 @@
 import re
 from typing import Optional, Tuple, Union
 
-from
-from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+from ...types import ChatCompletionChunkDelta, CompletionChoice
 
 
-
-
-@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
-class DeepSeekR1ReasoningParser(ReasoningParser):
-    """Reasoning parser for DeepSeek-R1 model."""
+class ReasoningParser:
+    """Reasoning parser for reasoning model."""
 
     def __init__(
         self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
     ):
-
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
         self.reasoning_regex = re.compile(
            rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
        )
@@ -23,7 +20,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
         self,
         previous_text: str,
         current_text: str,
-
+        delta_text: str,
     ) -> ChatCompletionChunkDelta:
         """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
 
@@ -34,10 +31,9 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
         Yields:
             str: Extracted reasoning content chunks.
         """
-
-
-
-        delta_text = delta["content"]
+        delta = ChatCompletionChunkDelta(
+            content=delta_text,
+        )
 
         # Check if <think> is present in previous or delta.
         # Keep compatibility with models that don't generate <think> tokens.
```
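Functionally, the old registry lookup (`ReasoningParserManager` plus a per-model `DeepSeekR1ReasoningParser`) is replaced by a single generic `ReasoningParser` that takes the start/end tags directly. Below is a standalone sketch of what such a tag-based splitter does for non-streaming output; the class and method here are illustrative only and are not the package's own implementation.

```python
import re
from typing import Optional, Tuple


class ThinkTagParser:
    """Illustrative parser that splits '<think>...</think>' reasoning from the answer."""

    def __init__(self, start_tag: str = "<think>", end_tag: str = "</think>"):
        self.start_tag = start_tag
        self.end_tag = end_tag
        # Same regex shape as in the diff above: non-greedy match across newlines.
        self.regex = re.compile(rf"{start_tag}(.*?){end_tag}", re.DOTALL)

    def extract_reasoning_content(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        # Returns (reasoning_content, content); reasoning is None when no tags are found.
        match = self.regex.search(text)
        if match is None:
            return None, text
        reasoning = match.group(1).strip()
        content = (text[: match.start()] + text[match.end():]).strip()
        return reasoning, content or None


if __name__ == "__main__":
    parser = ThinkTagParser()
    print(parser.extract_reasoning_content("<think>2 + 2 = 4</think>The answer is 4."))
    # -> ('2 + 2 = 4', 'The answer is 4.')
```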
xinference/model/llm/sglang/core.py
CHANGED
```diff
@@ -48,6 +48,7 @@ class SGLANGModelConfig(TypedDict, total=False):
     nnodes: Optional[int]
     node_rank: Optional[int]
     dist_init_addr: Optional[str]
+    reasoning_content: bool
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
@@ -99,6 +100,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "QwQ-32B",
     "deepseek-r1-distill-qwen",
     "deepseek-r1-distill-llama",
     "deepseek-v3",
@@ -143,6 +145,8 @@ class SGLANGModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
         self._model_config = self._sanitize_model_config(self._model_config)
+        reasoning_content = self._model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
 
         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -255,6 +259,7 @@ class SGLANGModel(LLM):
         else:
             model_config["mem_fraction_static"] = 0.88
         model_config.setdefault("log_level", "info")
+        model_config.setdefault("reasoning_content", False)
 
         return model_config
 
@@ -547,8 +552,8 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if stream:
             agen = await self.async_generate(full_prompt, generate_config)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
-            return self._async_to_chat_completion_chunks(agen)
+            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
             c = await self.async_generate(full_prompt, generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
-            return self._to_chat_completion(c)
+            return self._to_chat_completion(c, self.reasoning_parser)
```
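In practice, the SGLang engine now accepts a `reasoning_content` flag in its model config (default `False`), pops it before the engine starts, and uses it to set up `self.reasoning_parser`. The following client-side sketch is hypothetical: it assumes the RESTful client forwards extra keyword arguments into the engine's model config and that the chat handle uses the `messages`-based signature; the endpoint and model name are placeholders, none of which is shown in this diff.

```python
# Hypothetical usage sketch; names of the endpoint and model are placeholders.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",  # placeholder reasoning model
    model_engine="sglang",
    reasoning_content=True,  # new flag; popped by _sanitize_model_config above
)
model = client.get_model(model_uid)
completion = model.chat(
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
)
message = completion["choices"][0]["message"]
# With the flag on, the '<think>...</think>' text is expected to come back separately.
print(message.get("reasoning_content"))
print(message["content"])
```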
xinference/model/llm/transformers/chatglm.py
CHANGED
```diff
@@ -383,7 +383,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             function_call = self._process_response_non_streaming(
                 response, tools, use_tool=True
             )
-            return self.
+            return self._post_process_completion(
                 self.model_family, self.model_uid, function_call
             )
         else:
@@ -397,7 +397,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             prompt_tokens = len(inputs["input_ids"][0])
             for chunk_text in self._stream_chat(inputs, tools, **kwargs):
                 if tools and isinstance(chunk_text, dict):
-                    yield self.
+                    yield self._post_process_completion_chunk(
                         self.model_family, self.model_uid, chunk_text
                     )
                     return
@@ -484,7 +484,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             function_call = self._process_response_non_streaming(
                 response, req.tools, use_tool=True
             )
-            req.completion[0] = self.
+            req.completion[0] = self._post_process_completion(
                 self.model_family, self.model_uid, function_call
             )
             req.completion[0]["usage"] = usage
@@ -516,7 +516,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 c for c in req.completion if not isinstance(c, str)
             ][0]["id"]
             results.append(
-                self.
+                self._post_process_completion_chunk(
                     self.model_family,
                     self.model_uid,
                     new_response,
```
xinference/model/llm/transformers/core.py
CHANGED
```diff
@@ -61,6 +61,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "deepseek-vl-chat",
     "internvl-chat",
     "internvl2",
+    "Internvl2.5",
+    "Internvl2.5-MPO",
     "cogvlm2",
     "cogvlm2-video-llama3-chat",
     "MiniCPM-Llama3-V-2_5",
@@ -112,6 +114,7 @@ class PytorchModel(LLM):
         pytorch_model_config.setdefault("trust_remote_code", True)
         pytorch_model_config.setdefault("max_num_seqs", 16)
         pytorch_model_config.setdefault("enable_tensorizer", False)
+        pytorch_model_config.setdefault("reasoning_content", False)
         return pytorch_model_config
 
     def _sanitize_generate_config(
@@ -324,6 +327,9 @@ class PytorchModel(LLM):
             kwargs.update({"device_map": "auto"})
             is_device_map_auto = True
 
+        reasoning_content = self._pytorch_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
@@ -714,23 +720,34 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
 
     def handle_chat_result_non_streaming(self, req: InferenceRequest):
         if req.tools:
-            req.completion[0] = self.
-                self.model_family,
+            req.completion[0] = self._post_process_completion(
+                self.model_family,
+                self.model_uid,
+                req.completion[0],
+                self.reasoning_parser,
             )
         else:
-            req.completion[0] = self._to_chat_completion(
+            req.completion[0] = self._to_chat_completion(
+                req.completion[0], self.reasoning_parser
+            )
 
     def handle_chat_result_streaming(self, req: InferenceRequest):
         results = []
         for i, c in enumerate(req.completion):
             if c == "<bos_stream>":
                 results.append(
-                    self._get_first_chat_completion_chunk(
+                    self._get_first_chat_completion_chunk(
+                        req.completion[i + 1], self.reasoning_parser
+                    )
                 )
             elif c == "<eos_stream>":
                 break
             else:
-                results.append(
+                results.append(
+                    self._to_chat_completion_chunk(
+                        c, self.reasoning_parser, req.previous_texts
+                    )
+                )
 
         if req.stopped and req.include_usage:
             results.append(self._get_final_chat_completion_chunk(req.completion[-1]))
```
xinference/model/llm/transformers/intern_vl.py
CHANGED
```diff
@@ -265,7 +265,8 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
-        model_name =
+        model_name = self.model_family.model_name.lower().replace("-mpo", "")
+        model_name = f"{model_name}-{model_size}"
         num_layers = {
             "internvl2-1B": 24,
             "internvl2-2B": 24,
```
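The name normalization above maps the new MPO variants onto the existing per-size layer table. A quick illustration of the lookup key it produces; the layer counts shown are only the ones visible in this hunk, not the full table.

```python
# Illustration of the lookup-key normalization used above.
num_layers = {"internvl2-1B": 24, "internvl2-2B": 24}


def lookup_key(model_name: str, model_size: str) -> str:
    # "InternVL2.5-MPO" and "InternVL2.5" both normalize to "internvl2.5-<size>".
    return f"{model_name.lower().replace('-mpo', '')}-{model_size}"


print(lookup_key("InternVL2", "2B"))              # internvl2-2B
print(num_layers[lookup_key("InternVL2", "2B")])  # 24
print(lookup_key("InternVL2.5-MPO", "8B"))        # internvl2.5-8B (same key as InternVL2.5)
```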
xinference/model/llm/transformers/utils.py
CHANGED
```diff
@@ -132,7 +132,7 @@ def _pad_seqs_inplace(seqs: List[List[int]], reqs: List[InferenceRequest], pad:
 
     def get_max_src_len(context_len: int, r: InferenceRequest) -> int:
         max_new_tokens = int(
-            r.sanitized_generate_config.get("max_tokens"
+            r.sanitized_generate_config.get("max_tokens") or max_tokens_field.default
         )
         return context_len - max_new_tokens - 8
 
```
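The `or max_tokens_field.default` fallback matters because a generate config may carry an explicit `max_tokens: None`, which a plain `dict.get(key, default)` would pass straight through. A minimal sketch of the difference, with a stand-in default value:

```python
# Why ".get(...) or default" is used instead of ".get(..., default)":
# an explicit None in the config should still fall back to the default.
DEFAULT_MAX_TOKENS = 1024  # stand-in for max_tokens_field.default

config = {"max_tokens": None}
print(config.get("max_tokens", DEFAULT_MAX_TOKENS))    # None  (default not applied)
print(config.get("max_tokens") or DEFAULT_MAX_TOKENS)  # 1024  (falls back as intended)
print(int({"max_tokens": 256}.get("max_tokens") or DEFAULT_MAX_TOKENS))  # 256
# Caveat: "or" would also replace an explicit 0, which is not a meaningful
# max_tokens value here anyway.
```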
xinference/model/llm/utils.py
CHANGED
```diff
@@ -41,6 +41,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionChunkDelta,
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
@@ -54,7 +55,7 @@ from .llm_family import (
     _get_cache_dir,
     get_cache_status,
 )
-from .
+from .reasoning_parser import ReasoningParser
 
 logger = logging.getLogger(__name__)
 
@@ -243,62 +244,95 @@ class ChatModelMixin:
         raise ValueError(f"Invalid model family: {model_family}")
 
     @classmethod
-    def _to_chat_completion_chunk(
+    def _to_chat_completion_chunk(
+        cls,
+        chunk: CompletionChunk,
+        reasoning_parser: Optional[ReasoningParser] = None,
+        previous_texts: Optional[List[str]] = None,
+    ) -> ChatCompletionChunk:
         choices = chunk.get("choices")
         if (
             chunk.get("object") == "chat.completion.chunk"
             and choices
             and "delta" in choices[0]
         ):
+            if reasoning_parser is not None:
+                # process parsing reasoning content
+                assert previous_texts is not None
+                delta = choices[0]["delta"]  # type: ignore
+                if text := delta.get("content"):
+                    current_text = previous_texts[-1] + text
+                    delta = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_texts[-1],
+                        current_text=current_text,
+                        delta_text=text,
+                    )
+                    previous_texts[-1] = current_text
+                choices[0]["delta"] = delta  # type: ignore
             # Already a ChatCompletionChunk, we don't need to convert chunk.
             return cast(ChatCompletionChunk, chunk)
+
+        choices_list = []
+        for i, choice in enumerate(choices):  # type: ignore
+            delta = ChatCompletionChunkDelta()
+            if "text" in choice and choice["finish_reason"] is None:
+                if reasoning_parser is None:
+                    delta["content"] = choice["text"]
+                else:
+                    assert previous_texts is not None
+                    current_text = previous_texts[-1] + choice["text"]
+                    delta = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_texts[-1],
+                        current_text=current_text,
+                        delta_text=choice["text"],
+                    )
+                    previous_texts[-1] = current_text
+            if "tool_calls" in choice:
+                delta["tool_calls"] = choice["tool_calls"]
+            choices_list.append(
+                {
+                    "index": i,
+                    "delta": delta,
+                    "finish_reason": choice["finish_reason"],
+                }
+            )
         chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],
             "object": "chat.completion.chunk",
-            "choices":
-                {
-                    "index": i,
-                    "delta": {
-                        **(
-                            {"content": choice["text"]}
-                            if ("text" in choice and choice["finish_reason"] is None)
-                            else {}
-                        ),
-                        **(
-                            {"tool_calls": choice["tool_calls"]}
-                            if "tool_calls" in choice
-                            else {}
-                        ),
-                    },
-                    "finish_reason": choice["finish_reason"],
-                }
-                for i, choice in enumerate(chunk["choices"])
-            ],
+            "choices": choices_list,
         }
         return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
     def _get_first_chat_completion_chunk(
-        cls,
+        cls,
+        chunk: CompletionChunk,
+        reasoning_parser: Optional[ReasoningParser] = None,
     ) -> ChatCompletionChunk:
+        choices_list = []
+        for i, choice in enumerate(chunk["choices"]):
+            delta = {
+                "role": "assistant",
+            }
+            if reasoning_parser is None:
+                delta["content"] = ""
+            else:
+                delta["reasoning_content"] = ""
+            choices_list.append(
+                {
+                    "index": i,
+                    "delta": delta,
+                    "finish_reason": None,
+                }
+            )
         chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],
             "object": "chat.completion.chunk",
-            "choices":
-                {
-                    "index": i,
-                    "delta": {
-                        "role": "assistant",
-                        "content": "",
-                    },
-                    "finish_reason": None,
-                }
-                for i, choice in enumerate(chunk["choices"])
-            ],
+            "choices": choices_list,
         }
         return cast(ChatCompletionChunk, chat_chunk)
 
@@ -324,15 +358,19 @@ class ChatModelMixin:
         chunks: Iterator[CompletionChunk],
         reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
+        previous_texts = [""]
         for i, chunk in enumerate(chunks):
             if i == 0:
-                yield cls._get_first_chat_completion_chunk(chunk)
+                yield cls._get_first_chat_completion_chunk(chunk, reasoning_parse)
             # usage
             choices = chunk.get("choices")
             if not choices:
                 yield cls._get_final_chat_completion_chunk(chunk)
             else:
-
+                r = cls._to_chat_completion_chunk(
+                    chunk, reasoning_parse, previous_texts
+                )
+                yield r
 
     @classmethod
     def _tools_to_messages_for_deepseek(
@@ -370,33 +408,19 @@ class ChatModelMixin:
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
-
-        current_text = ""
+        previous_texts = [""]
         async for chunk in chunks:
             if i == 0:
-                chat_chunk = cls._get_first_chat_completion_chunk(
+                chat_chunk = cls._get_first_chat_completion_chunk(
+                    chunk, reasoning_parser
+                )
             elif not chunk.get("choices"):
                 # usage
                 chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
-                chat_chunk = cls._to_chat_completion_chunk(
-
-
-                if choices is None:
-                    continue
-                for choice in choices:
-                    delta = choice.get("delta")
-                    if not delta:
-                        continue
-                    current_text = previous_text + delta.get("content", "")
-                    choice[
-                        "delta"
-                    ] = reasoning_parser.extract_reasoning_content_streaming(
-                        previous_text=previous_text,
-                        current_text=current_text,
-                        delta=delta,
-                    )
-                    previous_text = current_text
+                chat_chunk = cls._to_chat_completion_chunk(
+                    chunk, reasoning_parser, previous_texts
+                )
             yield chat_chunk
             i += 1
 
@@ -404,6 +428,21 @@ class ChatModelMixin:
     def _to_chat_completion(
         completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
     ) -> ChatCompletion:
+        if completion.get("object") == "chat.completion" and completion.get("choices"):
+            # Already a ChatCompletion
+            if reasoning_parser is not None:
+                for choice in completion["choices"]:
+                    message = choice["message"]  # type: ignore
+                    text = message["content"]
+                    (
+                        reasoning_content,
+                        content,
+                    ) = reasoning_parser.extract_reasoning_content(text)
+                    message["content"] = content
+                    if reasoning_content is not None:
+                        message["reasoning_content"] = reasoning_content
+            return cast(ChatCompletion, completion)
+
         choices = []
         for i, choice in enumerate(completion["choices"]):
             content = choice["text"]
@@ -565,7 +604,14 @@ class ChatModelMixin:
         return result
 
     @classmethod
-    def
+    def _post_process_completion_chunk(
+        cls,
+        model_family,
+        model_uid,
+        c,
+        chunk_id=None,
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
         tool_calls = []
@@ -585,11 +631,22 @@ class ChatModelMixin:
             else:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
+
+        reasoning_content = None
+        content = ". ".join(failed_contents) if failed_contents else None
+        if reasoning_parser is not None:
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                content
+            )
         d = {
             "role": "assistant",
-            "content":
+            "content": content,
             "tool_calls": tool_calls,
         }
+        # add only reasoning_content is None
+        if reasoning_content is not None:
+            d["reasoning_content"] = reasoning_content
+
         try:
             usage = c.get("usage")
             assert "prompt_tokens" in usage
@@ -616,7 +673,13 @@ class ChatModelMixin:
         }
 
     @classmethod
-    def
+    def _post_process_completion(
+        cls,
+        model_family,
+        model_uid,
+        c,
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ):
         _id = str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
 
@@ -637,11 +700,22 @@ class ChatModelMixin:
             else:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
+
+        reasoning_content = None
+        content = ". ".join(failed_contents) if failed_contents else None
+        if reasoning_parser is not None:
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                content
+            )
         m = {
             "role": "assistant",
-            "content":
+            "content": content,
             "tool_calls": tool_calls,
         }
+        # add only reasoning_content is None
+        if reasoning_content is not None:
+            m["reasoning_content"] = reasoning_content
+
         try:
             usage = c.get("usage")
             assert "prompt_tokens" in usage
```
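The new `previous_texts` bookkeeping lets `_to_chat_completion_chunk` hand the parser the accumulated text alongside each delta, so the closing tag can still be detected when the tokenizer splits it across chunks (the same concern behind the vLLM change below that drops a user-supplied `stop`). Here is a standalone sketch of that accumulation logic, assuming reasoning comes first and `</think>` ends it; it is not the package's actual parser and does not handle a missing or explicit opening tag.

```python
from typing import Dict, Iterator, Optional


def split_reasoning_stream(
    deltas: Iterator[str], end_tag: str = "</think>"
) -> Iterator[Dict[str, Optional[str]]]:
    """Assign each streamed delta to reasoning_content or content (illustrative only)."""
    previous_text = ""  # mirrors the previous_texts[-1] accumulation in utils.py
    for delta_text in deltas:
        current_text = previous_text + delta_text
        end = current_text.find(end_tag)
        if end == -1:
            # Still inside the reasoning block (or no closing tag seen yet).
            yield {"reasoning_content": delta_text or None, "content": None}
        elif end + len(end_tag) > len(previous_text):
            # The closing tag is completed by this delta, even if it was split
            # across deltas; emit the pieces on either side of it. A more careful
            # implementation would also hold back a partially streamed tag.
            yield {
                "reasoning_content": current_text[len(previous_text):end] or None,
                "content": current_text[end + len(end_tag):] or None,
            }
        else:
            # Reasoning finished in an earlier delta; everything new is answer text.
            yield {"reasoning_content": None, "content": delta_text or None}
        previous_text = current_text


if __name__ == "__main__":
    for piece in split_reasoning_stream(
        ["First, 2 + 2 = 4.", "</think>", "So the answer is 4."]
    ):
        print(piece)
    # {'reasoning_content': 'First, 2 + 2 = 4.', 'content': None}
    # {'reasoning_content': None, 'content': None}
    # {'reasoning_content': None, 'content': 'So the answer is 4.'}
```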
xinference/model/llm/vllm/core.py
CHANGED
```diff
@@ -43,8 +43,6 @@ from ....types import (
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
-from ..reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
-from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -160,6 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
+    VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
 
@@ -196,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -211,9 +211,10 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
-    VLLM_SUPPORTED_CHAT_MODELS.append("
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
 
 
 class VLLMModel(LLM):
@@ -243,7 +244,6 @@ class VLLMModel(LLM):
         self.lora_modules = peft_model
         self.lora_requests: List[LoRARequest] = []
         self._xavier_config = None
-        self.reasoning_parser = None
 
     def set_xavier_config(self, value: Optional[Dict]):
         self._xavier_config = value  # type: ignore
@@ -274,14 +274,8 @@ class VLLMModel(LLM):
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
 
-
-
-        module_name = self.model_family.model_family or self.model_family.model_name
-        self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-        self.reasoning_parser = self.reasoning_parser(
-            self.model_family.reasoning_start_tag,
-            self.model_family.reasoning_end_tag,
-        )
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -581,6 +575,10 @@ class VLLMModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
         sanitized_generate_config = self._sanitize_generate_config(generate_config)
+        if self.reasoning_parser:
+            # For reasoning model, the </think> we be split into multiple words,
+            # if `stop` param is passed, so we pop it from config.
+            sanitized_generate_config.pop("stop")
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
@@ -812,18 +810,23 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         i = 0
         async for chunk in chunks:
             if i == 0:
-                yield self._get_first_chat_completion_chunk(
+                yield self._get_first_chat_completion_chunk(
+                    chunk, self.reasoning_parser
+                )
             # usage
             choices = chunk.get("choices")
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
                 if self.is_tool_call_chunk(chunk):
-                    yield self.
-                        self.model_family,
+                    yield self._post_process_completion_chunk(
+                        self.model_family,
+                        self.model_uid,
+                        chunk,
+                        reasoning_parser=self.reasoning_parser,
                     )
                 else:
-                    yield self._to_chat_completion_chunk(chunk)
+                    yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
             i += 1
 
     @vllm_check
@@ -863,7 +866,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         )
         assert not isinstance(c, AsyncGenerator)
         if tools:
-            return self.
+            return self._post_process_completion(
+                self.model_family, self.model_uid, c, self.reasoning_parser
+            )
         return self._to_chat_completion(c, self.reasoning_parser)
 
 
@@ -905,31 +910,15 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     def _sanitize_model_config(
         self, model_config: Optional[VLLMModelConfig]
     ) -> VLLMModelConfig:
-
-
-
-
-
-
-
-
-
-        model_config.setdefault("swap_space", 4)
-        model_config.setdefault("gpu_memory_utilization", 0.90)
-        model_config.setdefault("max_num_seqs", 256)
-        model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len", None)
-        model_config["limit_mm_per_prompt"] = (
-            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
-            if model_config.get("limit_mm_per_prompt")
-            else {
-                "image": 2,  # default 2 images all chat
-            }
-        )
-        # Add scheduling policy if vLLM version is 0.6.3 or higher
-        if vllm.__version__ >= "0.6.3":
-            model_config.setdefault("scheduling_policy", "fcfs")
-
+        model_config = super()._sanitize_model_config(model_config)
+        if vllm.__version__ >= "0.5.5":
+            model_config["limit_mm_per_prompt"] = (
+                json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+                if model_config.get("limit_mm_per_prompt")
+                else {
+                    "image": 2,  # default 2 images all chat
+                }
+            )
         return model_config
 
     def _sanitize_chat_config(
```
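For reference, once the parser is active a non-streaming chat completion carries the extracted reasoning next to the answer. The shape below is illustrative only: the values are invented, and only the keys mirror the `reasoning_content` fields touched throughout this release.

```python
# Illustrative response shape; values are invented, keys mirror this release's additions.
example_chat_completion = {
    "id": "chat-0123456789",
    "object": "chat.completion",
    "created": 1741000000,
    "model": "deepseek-r1-distill-qwen",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "reasoning_content": "The user asks why the sky is blue; recall Rayleigh scattering...",
                "content": "Shorter (blue) wavelengths scatter far more strongly, so the sky appears blue.",
            },
            "finish_reason": "stop",
        }
    ],
    "usage": {"prompt_tokens": 12, "completion_tokens": 58, "total_tokens": 70},
}
```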