xinference 1.5.1__py3-none-any.whl → 1.6.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +97 -8
- xinference/client/restful/restful_client.py +51 -11
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/worker.py +31 -37
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +1 -0
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +20 -3
- xinference/model/audio/model_spec_modelscope.json +18 -1
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +37 -110
- xinference/model/llm/core.py +15 -6
- xinference/model/llm/llama_cpp/core.py +25 -353
- xinference/model/llm/llm_family.json +613 -89
- xinference/model/llm/llm_family.py +9 -1
- xinference/model/llm/llm_family_modelscope.json +540 -90
- xinference/model/llm/mlx/core.py +6 -3
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +16 -3
- xinference/model/llm/transformers/chatglm.py +2 -2
- xinference/model/llm/transformers/cogagent.py +1 -1
- xinference/model/llm/transformers/cogvlm2.py +1 -1
- xinference/model/llm/transformers/core.py +9 -3
- xinference/model/llm/transformers/glm4v.py +1 -1
- xinference/model/llm/transformers/minicpmv26.py +1 -1
- xinference/model/llm/transformers/qwen-omni.py +6 -0
- xinference/model/llm/transformers/qwen_vl.py +1 -1
- xinference/model/llm/utils.py +68 -45
- xinference/model/llm/vllm/core.py +38 -18
- xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +133 -16
- xinference/model/video/model_spec.json +54 -0
- xinference/model/video/model_spec_modelscope.json +56 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +0 -71
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/src/locales/en.json +6 -4
- xinference/web/ui/src/locales/zh.json +6 -4
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/METADATA +59 -39
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/RECORD +87 -87
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
- xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/core.py
CHANGED
@@ -160,7 +160,10 @@ class MLXModel(LLM):

     def load(self):
         reasoning_content = self._model_config.pop("reasoning_content")
-        self.
+        enable_thinking = self._model_config.pop("enable_thinking", True)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         kwargs = {}
         kwargs["revision"] = self._model_config.get(
@@ -450,7 +453,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
         )
         if tools:
             if (
@@ -634,7 +637,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         from qwen_vl_utils import process_vision_info

         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config)  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser)  # type: ignore
             or {}
         )
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
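The load() hunk above pops the new options out of the model config before the model is loaded. A minimal standalone sketch of that handling follows; the dict and its values are illustrative only, not taken from the diff. Note that this MLX hunk defaults enable_thinking to True, while the SGLang and Transformers hunks later in this diff default it to False.

# Illustrative config, not from the diff.
model_config = {"reasoning_content": True, "enable_thinking": False}

reasoning_content = model_config.pop("reasoning_content")
enable_thinking = model_config.pop("enable_thinking", True)  # MLX default; SGLang/Transformers pass False

# The backend then hands both flags to the reasoning parser setup:
# self.prepare_parse_reasoning_content(reasoning_content, enable_thinking=enable_thinking)
print(reasoning_content, enable_thinking)  # -> True False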

xinference/model/llm/reasoning_parser.py
CHANGED
@@ -1,20 +1,33 @@
 import re
-from typing import Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, Union

-from ...types import
+from ...types import (
+    ChatCompletionChunk,
+    ChatCompletionChunkDelta,
+    CompletionChoice,
+    CompletionChunk,
+)


 class ReasoningParser:
     """Reasoning parser for reasoning model."""

     def __init__(
-        self,
+        self,
+        reasoning_content: bool = False,
+        reasoning_start_tag: str = "",
+        reasoning_end_tag: str = "",
+        enable_thinking: bool = True,
     ):
+        self.reasoning_content = reasoning_content
         self.reasoning_start_tag = reasoning_start_tag
         self.reasoning_end_tag = reasoning_end_tag
         self.reasoning_regex = re.compile(
             rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
         )
+        # enable_thinking can be set to False only for hybrid model
+        # e.g. qwen3, which can support both thinking and non-thinking
+        self.enable_thinking = enable_thinking

     def extract_reasoning_content_streaming(
         self,
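For orientation, a minimal sketch of constructing the parser with the widened signature shown above. The <think>/</think> tag values are assumptions drawn from the comments in the streaming hunks below; the constructor itself defaults both tags to empty strings.

from xinference.model.llm.reasoning_parser import ReasoningParser

# enable_thinking=False is only meaningful for hybrid models such as qwen3,
# which can answer with or without a thinking block.
parser = ReasoningParser(
    reasoning_content=True,
    reasoning_start_tag="<think>",   # assumed tag value
    reasoning_end_tag="</think>",    # assumed tag value
    enable_thinking=False,
)
print(parser.reasoning_content, parser.enable_thinking)  # -> True False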
@@ -62,9 +75,9 @@ class ReasoningParser:
                 delta["content"] = None
                 return delta
         elif self.reasoning_start_tag in delta_text:
+            start_idx = delta_text.find(self.reasoning_start_tag)
             if self.reasoning_end_tag in delta_text:
                 # <think> in delta, </think> in delta, extract reasoning content
-                start_idx = delta_text.find(self.reasoning_start_tag)
                 end_idx = delta_text.find(self.reasoning_end_tag)
                 reasoning_content = delta_text[
                     start_idx + len(self.reasoning_start_tag) : end_idx
@@ -79,7 +92,10 @@ class ReasoningParser:
             else:
                 # <think> in delta, no </think> in delta,
                 # reasoning content continues
-
+                reasoning_content = delta_text[
+                    start_idx + len(self.reasoning_start_tag) :
+                ]
+                delta["reasoning_content"] = reasoning_content
                 delta["content"] = None
                 return delta
         else:
@@ -142,3 +158,263 @@ class ReasoningParser:
         if len(final_output) == 0:
             return reasoning_content, ""
         return reasoning_content, final_output
+
+    def check_content_parser(self) -> bool:
+        """Check if the parser should extract reasoning content.
+
+        Returns:
+            bool: True if reasoning content should be extracted, False otherwise
+        """
+        return self.reasoning_content
+
+    def _create_chat_completion_chunk(
+        self, chunk: Union[Dict[str, Any], CompletionChunk], content: str
+    ) -> ChatCompletionChunk:
+        """Helper method to create a ChatCompletionChunk with specified content.
+
+        Args:
+            chunk: The original chunk to copy metadata from
+            content: The content to include in the chunk
+
+        Returns:
+            ChatCompletionChunk: A new chat completion chunk
+        """
+        return ChatCompletionChunk(
+            id="chat" + chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="chat.completion.chunk",
+            choices=[
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": content,
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        )
+
+    def _create_completion_chunk(
+        self, chunk: Union[Dict[str, Any], CompletionChunk], text: str
+    ) -> CompletionChunk:
+        """Helper method to create a CompletionChunk with specified text.
+
+        Args:
+            chunk: The original chunk to copy metadata from
+            text: The text to include in the chunk
+
+        Returns:
+            CompletionChunk: A new completion chunk
+        """
+        return CompletionChunk(
+            id=chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="text_completion",
+            choices=[
+                {
+                    "index": 0,
+                    "text": text,
+                    "logprobs": None,
+                    "finish_reason": None,
+                }
+            ],
+        )
+
+    async def prepare_reasoning_content_streaming(
+        self, chunks: AsyncGenerator[CompletionChunk, None]
+    ):
+        """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+        if not, add a chunk with the tag at the beginning.
+
+        Args:
+            chunks (AsyncGenerator[CompletionChunk, None]): Chunks from model output
+
+        Yields:
+            AsyncGenerator[CompletionChunk, None]: Processed chunks
+        """
+
+        # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+        # yield chunks as is
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            async for chunk in chunks:
+                yield chunk
+            return
+
+        # If chunks is empty, return
+        if not chunks:
+            return
+
+        # Flag to identify the first chunk
+        is_first_chunk = True
+
+        async for chunk in chunks:
+            if is_first_chunk:
+                # Reset the flag after processing the first chunk
+                is_first_chunk = False
+                choices = chunk.get("choices")
+                if not choices or not choices[0]:
+                    continue
+                if (
+                    chunk.get("object") == "chat.completion.chunk"
+                    and "delta" in choices[0]
+                ):
+                    # For chat completion chunks with delta format
+                    delta = choices[0].get("delta")
+                    if delta is None:
+                        continue
+                    assert isinstance(delta, dict)
+                    text = delta.get("content")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_chat_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                else:
+                    # For standard completion chunks
+                    text = choices[0].get("text")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                # Yield the original first chunk
+                yield chunk
+            else:
+                # For non-first chunks, yield directly
+                yield chunk
+
+    def prepare_reasoning_content_sync(self, chunks: Iterator[CompletionChunk]):
+        """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+        if not, add a chunk with the tag at the beginning. This is a synchronous version of
+        prepare_reasoning_content_streaming.
+
+        Args:
+            chunks (Iterator[CompletionChunk]): Chunks from model output
+
+        Returns:
+            Iterator[CompletionChunk]: Processed chunks
+        """
+        # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+        # yield chunks as is
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            for chunk in chunks:
+                yield chunk
+            return
+
+        # Flag to identify the first chunk
+        is_first_chunk = True
+
+        for chunk in chunks:
+            if is_first_chunk:
+                # Reset the flag after processing the first chunk
+                is_first_chunk = False
+                choices = chunk.get("choices")
+                if not choices or not choices[0]:
+                    continue
+                if (
+                    chunk.get("object") == "chat.completion.chunk"
+                    and "delta" in choices[0]
+                ):
+                    # For chat completion chunks with delta format
+                    delta = choices[0].get("delta")
+                    if delta is None:
+                        continue
+                    assert isinstance(delta, dict)
+                    text = delta.get("content")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_chat_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                else:
+                    # For standard completion chunks
+                    text = choices[0].get("text")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                # Yield the original first chunk
+                yield chunk
+            else:
+                # For non-first chunks, yield directly
+                yield chunk
+
+    def prepare_reasoning_content(self, completion):
+        """Ensures that the model output string starts with the reasoning_start_tag.
+
+        If the model_output is not a string (e.g., CompletionChoice), it extracts
+        the text content. If the reasoning_start_tag is not found in the text,
+        it prepends the tag to the text.
+
+        Args:
+            completion: The completion object containing model output,
+                which can be either a chat completion or a standard completion.
+        """
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            return completion
+
+        if completion.get("object") == "chat.completion" and completion.get("choices"):
+            text = completion["choices"][0]["message"]["content"]
+            if self.reasoning_start_tag not in text:
+                text = f"{self.reasoning_start_tag}\n{text}"
+                completion["choices"][0]["message"]["content"] = text
+            return completion
+
+        text = completion["choices"][0]["text"]
+        if self.reasoning_start_tag not in text:
+            text = f"{self.reasoning_start_tag}\n{text}"
+            completion["choices"][0]["text"] = text
+        return completion
+
+    def prepare_first_reasoning_content_chunk(
+        self,
+        chunk: CompletionChunk,
+    ) -> List[ChatCompletionChunk]:
+        """Prepares the first chunk of a completion by adding reasoning_start_tag if needed.
+
+        This function checks if the first chunk contains the reasoning_start_tag. If not,
+        it creates two new chunks containing the reasoning_start_tag and a newline character
+        that will be inserted before the original chunk.
+
+        Args:
+            chunk (CompletionChunk): The first chunk of a completion to check and possibly modify
+
+        Returns:
+            List[ChatCompletionChunk]: A list of new chunks to insert before the original chunk,
+                or an empty list if no modification is needed
+        """
+        chunks: List[ChatCompletionChunk] = []
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            return chunks
+
+        choices = chunk.get("choices")
+        if not choices or not choices[0]:
+            return chunks
+        text = choices[0].get("text")
+        if not text:
+            return chunks
+
+        if self.reasoning_start_tag not in text:
+            # Create chunks with reasoning_start_tag and newline
+            chunks.append(
+                self._create_chat_completion_chunk(
+                    chunk, f"{self.reasoning_start_tag}\n"
+                )
+            )
+
+        return chunks
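The non-streaming path added above is simple enough to show standalone. A self-contained sketch of the prepare_reasoning_content behaviour on a chat completion dict follows; the tag value and completion contents are made up, and the logic only mirrors the hunk.

REASONING_START_TAG = "<think>"  # assumed tag value; the parser's own default is ""

completion = {
    "object": "chat.completion",
    "choices": [
        {"index": 0, "message": {"role": "assistant", "content": "The answer is 42."}}
    ],
}

# Mirror prepare_reasoning_content: prepend the start tag if the reply lacks it,
# so downstream reasoning extraction still finds a reasoning block.
text = completion["choices"][0]["message"]["content"]
if REASONING_START_TAG not in text:
    completion["choices"][0]["message"]["content"] = f"{REASONING_START_TAG}\n{text}"

print(completion["choices"][0]["message"]["content"])
# <think>
# The answer is 42.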

xinference/model/llm/sglang/core.py
CHANGED
@@ -101,13 +101,17 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2-chat-0628",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "XiYanSQL-QwenCoder-2504",
     "QwQ-32B-Preview",
     "QwQ-32B",
     "deepseek-r1-distill-qwen",
     "deepseek-r1-distill-llama",
     "deepseek-v3",
     "deepseek-r1",
+    "DianJin-R1",
     "qwen3",
+    "HuatuoGPT-o1-Qwen2.5",
+    "HuatuoGPT-o1-LLaMA-3.1",
 ]
 SGLANG_SUPPORTED_VISION_MODEL_LIST = [
     "qwen2.5-vl-instruct",
@@ -155,7 +159,10 @@ class SGLANGModel(LLM):

         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
-        self.
+        enable_thinking = self._model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -568,7 +575,10 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
@@ -640,7 +650,10 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         )

         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
         images, video_inputs = process_vision_info(messages)

xinference/model/llm/transformers/chatglm.py
CHANGED
@@ -464,7 +464,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):

         full_context_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
-                r.generate_config
+                r.generate_config, self.reasoning_parser
             )
             or {}
         )
@@ -508,7 +508,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):

         if "<bos_stream>" in req.completion:
             bos_pos = req.completion.index("<bos_stream>")
-            results.
+            results.extend(
                 self._get_first_chat_completion_chunk(req.completion[bos_pos + 1])
             )


xinference/model/llm/transformers/cogagent.py
CHANGED
@@ -207,7 +207,7 @@ class CogAgentChatModel(PytorchChatModel):
             "return_dict": True,
         }
         full_context_kwargs.update(
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
         )
         assert self.model_family.chat_template is not None
         inputs = self.get_full_context(

xinference/model/llm/transformers/cogvlm2.py
CHANGED
@@ -316,7 +316,7 @@ class CogVLM2Model(PytorchChatModel):
     def get_dtype(self):
         return self._torch_type

-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         prompt, system_prompt, chat_history = parse_messages(messages)
         system_prompt = system_prompt or ""
         query, image, history = self.get_query_and_history(

xinference/model/llm/transformers/core.py
CHANGED
@@ -339,7 +339,10 @@ class PytorchModel(LLM):
             is_device_map_auto = True

         reasoning_content = self._pytorch_model_config.pop("reasoning_content")
-        self.
+        enable_thinking = self._pytorch_model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
@@ -702,7 +705,10 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         if (
             tools
@@ -753,7 +759,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         results = []
         for i, c in enumerate(req.completion):
             if c == "<bos_stream>":
-                results.
+                results.extend(
                     self._get_first_chat_completion_chunk(
                         req.completion[i + 1], self.reasoning_parser
                     )

xinference/model/llm/transformers/glm4v.py
CHANGED
@@ -196,7 +196,7 @@ class Glm4VModel(PytorchChatModel):
             has_content=False,
         )

-    def _get_full_prompt(self, messages, tools):
+    def _get_full_prompt(self, messages, tools, generate_config: dict):
         msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,

xinference/model/llm/transformers/minicpmv26.py
CHANGED
@@ -324,7 +324,7 @@ class MiniCPMV26Model(PytorchChatModel):
             "input_image": images,
         }

-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         msgs, video_existed = self._convert_to_specific_style(messages)
         if video_existed:
             raise RuntimeError(

xinference/model/llm/transformers/qwen-omni.py
CHANGED
@@ -67,6 +67,12 @@ class Qwen2_5OmniChatModel(PytorchChatModel):
         return False

     def load(self):
+        logger.debug(
+            "Try to load model, current python: %s, sys path: %s",
+            sys.executable,
+            sys.path,
+        )
+
         from transformers import (
             Qwen2_5OmniForConditionalGeneration,
             Qwen2_5OmniProcessor,

xinference/model/llm/transformers/qwen_vl.py
CHANGED
@@ -313,7 +313,7 @@ class QwenVLChatModel(PytorchChatModel):

         return raw_text, context_tokens

-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         prompt, qwen_history = self._get_prompt_and_chat_history(messages)
         _, context_tokens = self.make_context(self._tokenizer, prompt, qwen_history)
         return context_tokens