xinference 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/mlx/core.py
@@ -160,7 +160,10 @@ class MLXModel(LLM):
 
     def load(self):
         reasoning_content = self._model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._model_config.pop("enable_thinking", True)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
         kwargs = {}
         kwargs["revision"] = self._model_config.get(
@@ -450,7 +453,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
         )
         if tools:
             if (
@@ -634,7 +637,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         from qwen_vl_utils import process_vision_info
 
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config)  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser)  # type: ignore
             or {}
         )
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
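The hunks above thread a new enable_thinking switch from the engine's model config into the reasoning-content machinery. As a usage illustration only (a hedged sketch, not part of this diff): assuming launch-time keyword arguments still flow into the model config as in earlier releases, the two keys popped in load() could be supplied like this; the endpoint, model name and engine are placeholders.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # placeholder endpoint
model_uid = client.launch_model(
    model_name="qwen3",           # hybrid model supporting thinking and non-thinking
    model_engine="mlx",           # any engine whose load() pops these keys
    reasoning_content=True,       # split reasoning into the reasoning_content field
    enable_thinking=False,        # assumed switch to disable thinking for hybrid models
)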

xinference/model/llm/reasoning_parser.py
@@ -1,20 +1,33 @@
 import re
-from typing import Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, Union
 
-from ...types import ChatCompletionChunkDelta, CompletionChoice
+from ...types import (
+    ChatCompletionChunk,
+    ChatCompletionChunkDelta,
+    CompletionChoice,
+    CompletionChunk,
+)
 
 
 class ReasoningParser:
     """Reasoning parser for reasoning model."""
 
     def __init__(
-        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+        self,
+        reasoning_content: bool = False,
+        reasoning_start_tag: str = "",
+        reasoning_end_tag: str = "",
+        enable_thinking: bool = True,
     ):
+        self.reasoning_content = reasoning_content
         self.reasoning_start_tag = reasoning_start_tag
         self.reasoning_end_tag = reasoning_end_tag
         self.reasoning_regex = re.compile(
             rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
         )
+        # enable_thinking can be set to False only for hybrid model
+        # e.g. qwen3, which can support both thinking and non-thinking
+        self.enable_thinking = enable_thinking
 
     def extract_reasoning_content_streaming(
         self,
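For orientation, a minimal construction sketch under the new signature (illustrative only; the "<think>"/"</think>" values shown were the previous defaults, not the new empty-string defaults):

from xinference.model.llm.reasoning_parser import ReasoningParser

parser = ReasoningParser(
    reasoning_content=True,          # extract reasoning into the reasoning_content field
    reasoning_start_tag="<think>",
    reasoning_end_tag="</think>",
    enable_thinking=True,            # set False only for hybrid models such as qwen3
)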
@@ -62,9 +75,9 @@ class ReasoningParser:
                delta["content"] = None
                return delta
            elif self.reasoning_start_tag in delta_text:
+                start_idx = delta_text.find(self.reasoning_start_tag)
                if self.reasoning_end_tag in delta_text:
                    # <think> in delta, </think> in delta, extract reasoning content
-                    start_idx = delta_text.find(self.reasoning_start_tag)
                    end_idx = delta_text.find(self.reasoning_end_tag)
                    reasoning_content = delta_text[
                        start_idx + len(self.reasoning_start_tag) : end_idx
@@ -79,7 +92,10 @@ class ReasoningParser:
                else:
                    # <think> in delta, no </think> in delta,
                    # reasoning content continues
-                    delta["reasoning_content"] = delta_text
+                    reasoning_content = delta_text[
+                        start_idx + len(self.reasoning_start_tag) :
+                    ]
+                    delta["reasoning_content"] = reasoning_content
                    delta["content"] = None
                    return delta
            else:
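A worked micro-example of the slicing this fix introduces: when a streamed delta contains the start tag but no end tag, only the text after the tag is now reported as reasoning content (tag and delta values are illustrative):

tag = "<think>"
delta_text = "Sure.<think>step one"

start_idx = delta_text.find(tag)
reasoning_content = delta_text[start_idx + len(tag):]

assert reasoning_content == "step one"   # previously the whole delta_text was used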
@@ -142,3 +158,263 @@ class ReasoningParser:
         if len(final_output) == 0:
             return reasoning_content, ""
         return reasoning_content, final_output
+
+    def check_content_parser(self) -> bool:
+        """Check if the parser should extract reasoning content.
+
+        Returns:
+            bool: True if reasoning content should be extracted, False otherwise
+        """
+        return self.reasoning_content
+
+    def _create_chat_completion_chunk(
+        self, chunk: Union[Dict[str, Any], CompletionChunk], content: str
+    ) -> ChatCompletionChunk:
+        """Helper method to create a ChatCompletionChunk with specified content.
+
+        Args:
+            chunk: The original chunk to copy metadata from
+            content: The content to include in the chunk
+
+        Returns:
+            ChatCompletionChunk: A new chat completion chunk
+        """
+        return ChatCompletionChunk(
+            id="chat" + chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="chat.completion.chunk",
+            choices=[
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": content,
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        )
+
+    def _create_completion_chunk(
+        self, chunk: Union[Dict[str, Any], CompletionChunk], text: str
+    ) -> CompletionChunk:
+        """Helper method to create a CompletionChunk with specified text.
+
+        Args:
+            chunk: The original chunk to copy metadata from
+            text: The text to include in the chunk
+
+        Returns:
+            CompletionChunk: A new completion chunk
+        """
+        return CompletionChunk(
+            id=chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="text_completion",
+            choices=[
+                {
+                    "index": 0,
+                    "text": text,
+                    "logprobs": None,
+                    "finish_reason": None,
+                }
+            ],
+        )
+
+    async def prepare_reasoning_content_streaming(
+        self, chunks: AsyncGenerator[CompletionChunk, None]
+    ):
+        """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+        if not, add a chunk with the tag at the beginning.
+
+        Args:
+            chunks (AsyncGenerator[CompletionChunk, None]): Chunks from model output
+
+        Yields:
+            AsyncGenerator[CompletionChunk, None]: Processed chunks
+        """
+
+        # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+        # yield chunks as is
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            async for chunk in chunks:
+                yield chunk
+            return
+
+        # If chunks is empty, return
+        if not chunks:
+            return
+
+        # Flag to identify the first chunk
+        is_first_chunk = True
+
+        async for chunk in chunks:
+            if is_first_chunk:
+                # Reset the flag after processing the first chunk
+                is_first_chunk = False
+                choices = chunk.get("choices")
+                if not choices or not choices[0]:
+                    continue
+                if (
+                    chunk.get("object") == "chat.completion.chunk"
+                    and "delta" in choices[0]
+                ):
+                    # For chat completion chunks with delta format
+                    delta = choices[0].get("delta")
+                    if delta is None:
+                        continue
+                    assert isinstance(delta, dict)
+                    text = delta.get("content")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_chat_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                else:
+                    # For standard completion chunks
+                    text = choices[0].get("text")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                # Yield the original first chunk
+                yield chunk
+            else:
+                # For non-first chunks, yield directly
+                yield chunk
+
+    def prepare_reasoning_content_sync(self, chunks: Iterator[CompletionChunk]):
+        """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+        if not, add a chunk with the tag at the beginning. This is a synchronous version of
+        prepare_reasoning_content_streaming.
+
+        Args:
+            chunks (Iterator[CompletionChunk]): Chunks from model output
+
+        Returns:
+            Iterator[CompletionChunk]: Processed chunks
+        """
+        # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+        # yield chunks as is
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            for chunk in chunks:
+                yield chunk
+            return
+
+        # Flag to identify the first chunk
+        is_first_chunk = True
+
+        for chunk in chunks:
+            if is_first_chunk:
+                # Reset the flag after processing the first chunk
+                is_first_chunk = False
+                choices = chunk.get("choices")
+                if not choices or not choices[0]:
+                    continue
+                if (
+                    chunk.get("object") == "chat.completion.chunk"
+                    and "delta" in choices[0]
+                ):
+                    # For chat completion chunks with delta format
+                    delta = choices[0].get("delta")
+                    if delta is None:
+                        continue
+                    assert isinstance(delta, dict)
+                    text = delta.get("content")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_chat_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                else:
+                    # For standard completion chunks
+                    text = choices[0].get("text")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                # Yield the original first chunk
+                yield chunk
+            else:
+                # For non-first chunks, yield directly
+                yield chunk
+
+    def prepare_reasoning_content(self, completion):
+        """Ensures that the model output string starts with the reasoning_start_tag.
+
+        If the model_output is not a string (e.g., CompletionChoice), it extracts
+        the text content. If the reasoning_start_tag is not found in the text,
+        it prepends the tag to the text.
+
+        Args:
+            completion: The completion object containing model output,
+                which can be either a chat completion or a standard completion.
+        """
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            return completion
+
+        if completion.get("object") == "chat.completion" and completion.get("choices"):
+            text = completion["choices"][0]["message"]["content"]
+            if self.reasoning_start_tag not in text:
+                text = f"{self.reasoning_start_tag}\n{text}"
+                completion["choices"][0]["message"]["content"] = text
+            return completion
+
+        text = completion["choices"][0]["text"]
+        if self.reasoning_start_tag not in text:
+            text = f"{self.reasoning_start_tag}\n{text}"
+            completion["choices"][0]["text"] = text
+        return completion
+
+    def prepare_first_reasoning_content_chunk(
+        self,
+        chunk: CompletionChunk,
+    ) -> List[ChatCompletionChunk]:
+        """Prepares the first chunk of a completion by adding reasoning_start_tag if needed.
+
+        This function checks if the first chunk contains the reasoning_start_tag. If not,
+        it creates two new chunks containing the reasoning_start_tag and a newline character
+        that will be inserted before the original chunk.
+
+        Args:
+            chunk (CompletionChunk): The first chunk of a completion to check and possibly modify
+
+        Returns:
+            List[ChatCompletionChunk]: A list of new chunks to insert before the original chunk,
+                or an empty list if no modification is needed
+        """
+        chunks: List[ChatCompletionChunk] = []
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            return chunks
+
+        choices = chunk.get("choices")
+        if not choices or not choices[0]:
+            return chunks
+        text = choices[0].get("text")
+        if not text:
+            return chunks
+
+        if self.reasoning_start_tag not in text:
+            # Create chunks with reasoning_start_tag and newline
+            chunks.append(
+                self._create_chat_completion_chunk(
+                    chunk, f"{self.reasoning_start_tag}\n"
+                )
+            )
+
+        return chunks
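A hedged usage sketch for the new helpers (the chunk dict below is fabricated for illustration): prepare_reasoning_content_sync inspects the first chunk and, if the start tag is missing, yields an extra chunk carrying the tag before passing the original through.

from xinference.model.llm.reasoning_parser import ReasoningParser

parser = ReasoningParser(
    reasoning_content=True,
    reasoning_start_tag="<think>",
    reasoning_end_tag="</think>",
)

# Hypothetical first chunk from an engine that omits the opening tag.
chunk = {
    "id": "cmpl-0",
    "model": "qwen3",
    "created": 0,
    "object": "text_completion",
    "choices": [
        {"index": 0, "text": "step one...", "logprobs": None, "finish_reason": None}
    ],
}

out = list(parser.prepare_reasoning_content_sync(iter([chunk])))
# out[0] is an injected chunk whose text is "<think>\n"; out[1] is the original chunk.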

xinference/model/llm/sglang/core.py
@@ -101,13 +101,17 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2-chat-0628",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "XiYanSQL-QwenCoder-2504",
     "QwQ-32B-Preview",
     "QwQ-32B",
     "deepseek-r1-distill-qwen",
     "deepseek-r1-distill-llama",
     "deepseek-v3",
     "deepseek-r1",
+    "DianJin-R1",
     "qwen3",
+    "HuatuoGPT-o1-Qwen2.5",
+    "HuatuoGPT-o1-LLaMA-3.1",
 ]
 SGLANG_SUPPORTED_VISION_MODEL_LIST = [
     "qwen2.5-vl-instruct",
@@ -155,7 +159,10 @@ class SGLANGModel(LLM):
 
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -568,7 +575,10 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
@@ -640,7 +650,10 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         )
 
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
         images, video_inputs = process_vision_info(messages)

xinference/model/llm/transformers/chatglm.py
@@ -464,7 +464,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
         full_context_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
-                r.generate_config
+                r.generate_config, self.reasoning_parser
             )
             or {}
         )
@@ -508,7 +508,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
         if "<bos_stream>" in req.completion:
             bos_pos = req.completion.index("<bos_stream>")
-            results.append(
+            results.extend(
                 self._get_first_chat_completion_chunk(req.completion[bos_pos + 1])
             )
 

xinference/model/llm/transformers/cogagent.py
@@ -207,7 +207,7 @@ class CogAgentChatModel(PytorchChatModel):
             "return_dict": True,
         }
         full_context_kwargs.update(
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
         )
         assert self.model_family.chat_template is not None
         inputs = self.get_full_context(

xinference/model/llm/transformers/cogvlm2.py
@@ -316,7 +316,7 @@ class CogVLM2Model(PytorchChatModel):
     def get_dtype(self):
         return self._torch_type
 
-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         prompt, system_prompt, chat_history = parse_messages(messages)
         system_prompt = system_prompt or ""
         query, image, history = self.get_query_and_history(

xinference/model/llm/transformers/core.py
@@ -339,7 +339,10 @@ class PytorchModel(LLM):
             is_device_map_auto = True
 
         reasoning_content = self._pytorch_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._pytorch_model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
@@ -702,7 +705,10 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         if (
             tools
@@ -753,7 +759,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         results = []
         for i, c in enumerate(req.completion):
             if c == "<bos_stream>":
-                results.append(
+                results.extend(
                     self._get_first_chat_completion_chunk(
                         req.completion[i + 1], self.reasoning_parser
                     )
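The append → extend changes in this diff matter because the first-chunk helper now returns a list of chunks (the injected reasoning-start chunk plus the original), as suggested by prepare_first_reasoning_content_chunk earlier in this diff. A small, self-contained illustration with placeholder dicts:

results = []
first_chunks = [{"delta": "<think>\n"}, {"delta": "step one"}]   # hypothetical return value

results.append(first_chunks)   # -> [[{...}, {...}]]   one nested element, wrong shape
results.clear()
results.extend(first_chunks)   # -> [{...}, {...}]     flat sequence of chunks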

xinference/model/llm/transformers/glm4v.py
@@ -196,7 +196,7 @@ class Glm4VModel(PytorchChatModel):
             has_content=False,
         )
 
-    def _get_full_prompt(self, messages, tools):
+    def _get_full_prompt(self, messages, tools, generate_config: dict):
         msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,

xinference/model/llm/transformers/minicpmv26.py
@@ -324,7 +324,7 @@ class MiniCPMV26Model(PytorchChatModel):
             "input_image": images,
         }
 
-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         msgs, video_existed = self._convert_to_specific_style(messages)
         if video_existed:
             raise RuntimeError(

xinference/model/llm/transformers/qwen-omni.py
@@ -67,6 +67,12 @@ class Qwen2_5OmniChatModel(PytorchChatModel):
         return False
 
     def load(self):
+        logger.debug(
+            "Try to load model, current python: %s, sys path: %s",
+            sys.executable,
+            sys.path,
+        )
+
         from transformers import (
             Qwen2_5OmniForConditionalGeneration,
             Qwen2_5OmniProcessor,

xinference/model/llm/transformers/qwen_vl.py
@@ -313,7 +313,7 @@ class QwenVLChatModel(PytorchChatModel):
 
         return raw_text, context_tokens
 
-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         prompt, qwen_history = self._get_prompt_and_chat_history(messages)
        _, context_tokens = self.make_context(self._tokenizer, prompt, qwen_history)
         return context_tokens