xinference 1.3.0.post2__py3-none-any.whl → 1.3.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (53)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +1 -0
  3. xinference/conftest.py +7 -0
  4. xinference/core/chat_interface.py +39 -24
  5. xinference/core/model.py +3 -1
  6. xinference/core/scheduler.py +3 -0
  7. xinference/core/worker.py +1 -1
  8. xinference/model/embedding/core.py +12 -5
  9. xinference/model/llm/__init__.py +2 -1
  10. xinference/model/llm/core.py +10 -0
  11. xinference/model/llm/llama_cpp/core.py +266 -3
  12. xinference/model/llm/llm_family.json +390 -17
  13. xinference/model/llm/llm_family_modelscope.json +348 -29
  14. xinference/model/llm/mlx/core.py +15 -4
  15. xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +9 -13
  16. xinference/model/llm/sglang/core.py +7 -2
  17. xinference/model/llm/transformers/chatglm.py +4 -4
  18. xinference/model/llm/transformers/core.py +22 -5
  19. xinference/model/llm/transformers/intern_vl.py +2 -1
  20. xinference/model/llm/transformers/utils.py +1 -1
  21. xinference/model/llm/utils.py +134 -60
  22. xinference/model/llm/vllm/core.py +31 -42
  23. xinference/types.py +4 -0
  24. xinference/web/ui/build/asset-manifest.json +3 -3
  25. xinference/web/ui/build/index.html +1 -1
  26. xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
  27. xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
  35. xinference/web/ui/src/locales/en.json +9 -1
  36. xinference/web/ui/src/locales/zh.json +9 -1
  37. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/METADATA +9 -5
  38. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/RECORD +43 -44
  39. xinference/model/llm/reasoning_parsers/__init__.py +0 -13
  40. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
  41. xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
  42. xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
  43. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
  44. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
  45. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
  46. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
  47. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
  48. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
  49. /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
  50. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/LICENSE +0 -0
  51. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/WHEEL +0 -0
  52. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/entry_points.txt +0 -0
  53. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,17 @@
  import re
  from typing import Optional, Tuple, Union

- from ....types import ChatCompletionChunkDelta, CompletionChoice
- from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+ from ...types import ChatCompletionChunkDelta, CompletionChoice


- @ReasoningParserManager.register_module("deepseek-v3")
- @ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
- @ReasoningParserManager.register_module("deepseek-r1-distill-llama")
- class DeepSeekR1ReasoningParser(ReasoningParser):
-     """Reasoning parser for DeepSeek-R1 model."""
+ class ReasoningParser:
+     """Reasoning parser for reasoning model."""

      def __init__(
          self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
      ):
-         super().__init__(reasoning_start_tag, reasoning_end_tag)
+         self.reasoning_start_tag = reasoning_start_tag
+         self.reasoning_end_tag = reasoning_end_tag
          self.reasoning_regex = re.compile(
              rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
          )
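
The registry-based, DeepSeek-specific parser is collapsed here into a single generic ReasoningParser keyed only by the start/end tags. For orientation, a minimal standalone sketch of what the compiled regex splits out with the default tags; the splitting helper below only illustrates the extract_reasoning_content behaviour used elsewhere in this diff, it is not the packaged implementation:

import re

start_tag, end_tag = "<think>", "</think>"
reasoning_regex = re.compile(rf"{start_tag}(.*?){end_tag}", re.DOTALL)

text = "<think>The user wants 2 + 2, which is 4.</think>The answer is 4."
match = reasoning_regex.search(text)
if match:
    reasoning_content = match.group(1)     # "The user wants 2 + 2, which is 4."
    content = text[match.end():].lstrip()  # "The answer is 4."
else:
    reasoning_content, content = None, text
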
@@ -23,7 +20,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
          self,
          previous_text: str,
          current_text: str,
-         delta: ChatCompletionChunkDelta,
+         delta_text: str,
      ) -> ChatCompletionChunkDelta:
          """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.

@@ -34,10 +31,9 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
          Yields:
              str: Extracted reasoning content chunks.
          """
-         if delta is None:
-             return delta
-
-         delta_text = delta["content"]
+         delta = ChatCompletionChunkDelta(
+             content=delta_text,
+         )

          # Check if <think> is present in previous or delta.
          # Keep compatibility with models that don't generate <think> tokens.
@@ -48,6 +48,7 @@ class SGLANGModelConfig(TypedDict, total=False):
      nnodes: Optional[int]
      node_rank: Optional[int]
      dist_init_addr: Optional[str]
+     reasoning_content: bool


  class SGLANGGenerateConfig(TypedDict, total=False):
@@ -99,6 +100,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
      "qwen2.5-instruct",
      "qwen2.5-coder-instruct",
      "QwQ-32B-Preview",
+     "QwQ-32B",
      "deepseek-r1-distill-qwen",
      "deepseek-r1-distill-llama",
      "deepseek-v3",
@@ -143,6 +145,8 @@ class SGLANGModel(LLM):
              raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

          self._model_config = self._sanitize_model_config(self._model_config)
+         reasoning_content = self._model_config.pop("reasoning_content")
+         self.prepare_parse_reasoning_content(reasoning_content)

          # Fix: GH#2169
          if sgl.__version__ >= "0.2.14":
@@ -255,6 +259,7 @@ class SGLANGModel(LLM):
          else:
              model_config["mem_fraction_static"] = 0.88
          model_config.setdefault("log_level", "info")
+         model_config.setdefault("reasoning_content", False)

          return model_config

@@ -547,8 +552,8 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
          if stream:
              agen = await self.async_generate(full_prompt, generate_config) # type: ignore
              assert isinstance(agen, AsyncGenerator)
-             return self._async_to_chat_completion_chunks(agen)
+             return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
          else:
              c = await self.async_generate(full_prompt, generate_config) # type: ignore
              assert not isinstance(c, AsyncGenerator)
-             return self._to_chat_completion(c)
+             return self._to_chat_completion(c, self.reasoning_parser)
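
With reasoning_content now a recognised SGLang model-config key (defaulting to False above), it can be switched on when launching a reasoning model. A hedged usage sketch, assuming extra launch kwargs are forwarded into the engine's model config as with other engine options; the endpoint and model choice are placeholders:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # placeholder endpoint

model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",
    model_engine="sglang",
    model_size_in_billions=7,
    # Ask the engine to split <think>...</think> output into a separate
    # reasoning_content field instead of leaving it inline in content.
    reasoning_content=True,
)
model = client.get_model(model_uid)
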
@@ -383,7 +383,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
              function_call = self._process_response_non_streaming(
                  response, tools, use_tool=True
              )
-             return self._tool_calls_completion(
+             return self._post_process_completion(
                  self.model_family, self.model_uid, function_call
              )
          else:
@@ -397,7 +397,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
          prompt_tokens = len(inputs["input_ids"][0])
          for chunk_text in self._stream_chat(inputs, tools, **kwargs):
              if tools and isinstance(chunk_text, dict):
-                 yield self._tool_calls_completion_chunk(
+                 yield self._post_process_completion_chunk(
                      self.model_family, self.model_uid, chunk_text
                  )
                  return
@@ -484,7 +484,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
              function_call = self._process_response_non_streaming(
                  response, req.tools, use_tool=True
              )
-             req.completion[0] = self._tool_calls_completion(
+             req.completion[0] = self._post_process_completion(
                  self.model_family, self.model_uid, function_call
              )
              req.completion[0]["usage"] = usage
@@ -516,7 +516,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                  c for c in req.completion if not isinstance(c, str)
              ][0]["id"]
              results.append(
-                 self._tool_calls_completion_chunk(
+                 self._post_process_completion_chunk(
                      self.model_family,
                      self.model_uid,
                      new_response,
@@ -61,6 +61,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
      "deepseek-vl-chat",
      "internvl-chat",
      "internvl2",
+     "Internvl2.5",
+     "Internvl2.5-MPO",
      "cogvlm2",
      "cogvlm2-video-llama3-chat",
      "MiniCPM-Llama3-V-2_5",
@@ -112,6 +114,7 @@ class PytorchModel(LLM):
          pytorch_model_config.setdefault("trust_remote_code", True)
          pytorch_model_config.setdefault("max_num_seqs", 16)
          pytorch_model_config.setdefault("enable_tensorizer", False)
+         pytorch_model_config.setdefault("reasoning_content", False)
          return pytorch_model_config

      def _sanitize_generate_config(
@@ -324,6 +327,9 @@ class PytorchModel(LLM):
              kwargs.update({"device_map": "auto"})
              is_device_map_auto = True

+         reasoning_content = self._pytorch_model_config.pop("reasoning_content")
+         self.prepare_parse_reasoning_content(reasoning_content)
+
          if self._check_tensorizer_integrity():
              self._model, self._tokenizer = self._load_tensorizer(**kwargs)
          else:
@@ -714,23 +720,34 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):

      def handle_chat_result_non_streaming(self, req: InferenceRequest):
          if req.tools:
-             req.completion[0] = self._tool_calls_completion(
-                 self.model_family, self.model_uid, req.completion[0]
+             req.completion[0] = self._post_process_completion(
+                 self.model_family,
+                 self.model_uid,
+                 req.completion[0],
+                 self.reasoning_parser,
              )
          else:
-             req.completion[0] = self._to_chat_completion(req.completion[0])
+             req.completion[0] = self._to_chat_completion(
+                 req.completion[0], self.reasoning_parser
+             )

      def handle_chat_result_streaming(self, req: InferenceRequest):
          results = []
          for i, c in enumerate(req.completion):
              if c == "<bos_stream>":
                  results.append(
-                     self._get_first_chat_completion_chunk(req.completion[i + 1])
+                     self._get_first_chat_completion_chunk(
+                         req.completion[i + 1], self.reasoning_parser
+                     )
                  )
              elif c == "<eos_stream>":
                  break
              else:
-                 results.append(self._to_chat_completion_chunk(c))
+                 results.append(
+                     self._to_chat_completion_chunk(
+                         c, self.reasoning_parser, req.previous_texts
+                     )
+                 )

          if req.stopped and req.include_usage:
              results.append(self._get_final_chat_completion_chunk(req.completion[-1]))
@@ -265,7 +265,8 @@ class InternVLChatModel(PytorchChatModel):
          if world_size == 1:
              return None
          model_size = f"{self.model_spec.model_size_in_billions}B"
-         model_name = f"{self.model_family.model_name.lower()}-{model_size}"
+         model_name = self.model_family.model_name.lower().replace("-mpo", "")
+         model_name = f"{model_name}-{model_size}"
          num_layers = {
              "internvl2-1B": 24,
              "internvl2-2B": 24,
@@ -132,7 +132,7 @@ def _pad_seqs_inplace(seqs: List[List[int]], reqs: List[InferenceRequest], pad:

  def get_max_src_len(context_len: int, r: InferenceRequest) -> int:
      max_new_tokens = int(
-         r.sanitized_generate_config.get("max_tokens", max_tokens_field.default)
+         r.sanitized_generate_config.get("max_tokens") or max_tokens_field.default
      )
      return context_len - max_new_tokens - 8

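
The small change above matters because dict.get(key, default) only falls back when the key is absent; a sanitized config that explicitly carries max_tokens=None would previously reach int(None) and raise. A standalone illustration (max_tokens_default stands in for max_tokens_field.default):

max_tokens_default = 256        # stand-in for max_tokens_field.default
config = {"max_tokens": None}   # key present, value None

old_value = config.get("max_tokens", max_tokens_default)    # None; int(None) raises
new_value = config.get("max_tokens") or max_tokens_default  # 256

Note that the or form also falls back on a literal 0, which is acceptable here since a budget of zero new tokens is never useful.
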
@@ -41,6 +41,7 @@ from ...types import (
      ChatCompletion,
      ChatCompletionChoice,
      ChatCompletionChunk,
+     ChatCompletionChunkDelta,
      ChatCompletionMessage,
      Completion,
      CompletionChoice,
@@ -54,7 +55,7 @@ from .llm_family import (
      _get_cache_dir,
      get_cache_status,
  )
- from .reasoning_parsers.abs_reasoning_parsers import ReasoningParser
+ from .reasoning_parser import ReasoningParser

  logger = logging.getLogger(__name__)

@@ -243,62 +244,95 @@ class ChatModelMixin:
              raise ValueError(f"Invalid model family: {model_family}")

      @classmethod
-     def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk:
+     def _to_chat_completion_chunk(
+         cls,
+         chunk: CompletionChunk,
+         reasoning_parser: Optional[ReasoningParser] = None,
+         previous_texts: Optional[List[str]] = None,
+     ) -> ChatCompletionChunk:
          choices = chunk.get("choices")
          if (
              chunk.get("object") == "chat.completion.chunk"
              and choices
              and "delta" in choices[0]
          ):
+             if reasoning_parser is not None:
+                 # process parsing reasoning content
+                 assert previous_texts is not None
+                 delta = choices[0]["delta"] # type: ignore
+                 if text := delta.get("content"):
+                     current_text = previous_texts[-1] + text
+                     delta = reasoning_parser.extract_reasoning_content_streaming(
+                         previous_text=previous_texts[-1],
+                         current_text=current_text,
+                         delta_text=text,
+                     )
+                     previous_texts[-1] = current_text
+                     choices[0]["delta"] = delta # type: ignore
              # Already a ChatCompletionChunk, we don't need to convert chunk.
              return cast(ChatCompletionChunk, chunk)
+
+         choices_list = []
+         for i, choice in enumerate(choices): # type: ignore
+             delta = ChatCompletionChunkDelta()
+             if "text" in choice and choice["finish_reason"] is None:
+                 if reasoning_parser is None:
+                     delta["content"] = choice["text"]
+                 else:
+                     assert previous_texts is not None
+                     current_text = previous_texts[-1] + choice["text"]
+                     delta = reasoning_parser.extract_reasoning_content_streaming(
+                         previous_text=previous_texts[-1],
+                         current_text=current_text,
+                         delta_text=choice["text"],
+                     )
+                     previous_texts[-1] = current_text
+             if "tool_calls" in choice:
+                 delta["tool_calls"] = choice["tool_calls"]
+             choices_list.append(
+                 {
+                     "index": i,
+                     "delta": delta,
+                     "finish_reason": choice["finish_reason"],
+                 }
+             )
          chat_chunk = {
              "id": "chat" + chunk["id"],
              "model": chunk["model"],
              "created": chunk["created"],
              "object": "chat.completion.chunk",
-             "choices": [
-                 {
-                     "index": i,
-                     "delta": {
-                         **(
-                             {"content": choice["text"]}
-                             if ("text" in choice and choice["finish_reason"] is None)
-                             else {}
-                         ),
-                         **(
-                             {"tool_calls": choice["tool_calls"]}
-                             if "tool_calls" in choice
-                             else {}
-                         ),
-                     },
-                     "finish_reason": choice["finish_reason"],
-                 }
-                 for i, choice in enumerate(chunk["choices"])
-             ],
+             "choices": choices_list,
          }
          return cast(ChatCompletionChunk, chat_chunk)

      @classmethod
      def _get_first_chat_completion_chunk(
-         cls, chunk: CompletionChunk
+         cls,
+         chunk: CompletionChunk,
+         reasoning_parser: Optional[ReasoningParser] = None,
      ) -> ChatCompletionChunk:
+         choices_list = []
+         for i, choice in enumerate(chunk["choices"]):
+             delta = {
+                 "role": "assistant",
+             }
+             if reasoning_parser is None:
+                 delta["content"] = ""
+             else:
+                 delta["reasoning_content"] = ""
+             choices_list.append(
+                 {
+                     "index": i,
+                     "delta": delta,
+                     "finish_reason": None,
+                 }
+             )
          chat_chunk = {
              "id": "chat" + chunk["id"],
              "model": chunk["model"],
              "created": chunk["created"],
              "object": "chat.completion.chunk",
-             "choices": [
-                 {
-                     "index": i,
-                     "delta": {
-                         "role": "assistant",
-                         "content": "",
-                     },
-                     "finish_reason": None,
-                 }
-                 for i, choice in enumerate(chunk["choices"])
-             ],
+             "choices": choices_list,
          }
          return cast(ChatCompletionChunk, chat_chunk)

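
Both converters now share streaming state through a one-element previous_texts list: the caller owns the list and _to_chat_completion_chunk overwrites its last slot, so the accumulated text survives across chunks without extra plumbing. A minimal standalone sketch of that hand-off, using a stub parser in place of the packaged class:

class StubReasoningParser:
    """Illustrative stand-in: route text seen before </think> to reasoning_content."""

    def extract_reasoning_content_streaming(self, previous_text, current_text, delta_text):
        if "</think>" in previous_text:
            return {"content": delta_text}
        cleaned = delta_text.replace("<think>", "").replace("</think>", "")
        return {"reasoning_content": cleaned}


parser = StubReasoningParser()
previous_texts = [""]  # shared accumulator, mutated in place

for delta_text in ["<think>add the ", "numbers</think>", "4"]:
    current_text = previous_texts[-1] + delta_text
    delta = parser.extract_reasoning_content_streaming(
        previous_text=previous_texts[-1],
        current_text=current_text,
        delta_text=delta_text,
    )
    previous_texts[-1] = current_text
    print(delta)
# -> {'reasoning_content': 'add the '}
# -> {'reasoning_content': 'numbers'}
# -> {'content': '4'}
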
@@ -324,15 +358,19 @@ class ChatModelMixin:
          chunks: Iterator[CompletionChunk],
          reasoning_parse: Optional[ReasoningParser] = None,
      ) -> Iterator[ChatCompletionChunk]:
+         previous_texts = [""]
          for i, chunk in enumerate(chunks):
              if i == 0:
-                 yield cls._get_first_chat_completion_chunk(chunk)
+                 yield cls._get_first_chat_completion_chunk(chunk, reasoning_parse)
              # usage
              choices = chunk.get("choices")
              if not choices:
                  yield cls._get_final_chat_completion_chunk(chunk)
              else:
-                 yield cls._to_chat_completion_chunk(chunk)
+                 r = cls._to_chat_completion_chunk(
+                     chunk, reasoning_parse, previous_texts
+                 )
+                 yield r

      @classmethod
      def _tools_to_messages_for_deepseek(
@@ -370,33 +408,19 @@ class ChatModelMixin:
          reasoning_parser: Optional[ReasoningParser] = None,
      ) -> AsyncGenerator[ChatCompletionChunk, None]:
          i = 0
-         previous_text = ""
-         current_text = ""
+         previous_texts = [""]
          async for chunk in chunks:
              if i == 0:
-                 chat_chunk = cls._get_first_chat_completion_chunk(chunk)
+                 chat_chunk = cls._get_first_chat_completion_chunk(
+                     chunk, reasoning_parser
+                 )
              elif not chunk.get("choices"):
                  # usage
                  chat_chunk = cls._get_final_chat_completion_chunk(chunk)
              else:
-                 chat_chunk = cls._to_chat_completion_chunk(chunk)
-                 if reasoning_parser is not None:
-                     choices = chat_chunk.get("choices")
-                     if choices is None:
-                         continue
-                     for choice in choices:
-                         delta = choice.get("delta")
-                         if not delta:
-                             continue
-                         current_text = previous_text + delta.get("content", "")
-                         choice[
-                             "delta"
-                         ] = reasoning_parser.extract_reasoning_content_streaming(
-                             previous_text=previous_text,
-                             current_text=current_text,
-                             delta=delta,
-                         )
-                         previous_text = current_text
+                 chat_chunk = cls._to_chat_completion_chunk(
+                     chunk, reasoning_parser, previous_texts
+                 )
              yield chat_chunk
              i += 1

@@ -404,6 +428,21 @@ class ChatModelMixin:
      def _to_chat_completion(
          completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
      ) -> ChatCompletion:
+         if completion.get("object") == "chat.completion" and completion.get("choices"):
+             # Already a ChatCompletion
+             if reasoning_parser is not None:
+                 for choice in completion["choices"]:
+                     message = choice["message"] # type: ignore
+                     text = message["content"]
+                     (
+                         reasoning_content,
+                         content,
+                     ) = reasoning_parser.extract_reasoning_content(text)
+                     message["content"] = content
+                     if reasoning_content is not None:
+                         message["reasoning_content"] = reasoning_content
+             return cast(ChatCompletion, completion)
+
          choices = []
          for i, choice in enumerate(completion["choices"]):
              content = choice["text"]
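
When a parser is active, an already-converted chat.completion is rewritten in place: the <think> block moves into message["reasoning_content"] and message["content"] keeps only the final answer. Illustrative shape of one rewritten choice (values invented for the example):

choice = {
    "index": 0,
    "message": {
        "role": "assistant",
        "reasoning_content": "The user wants 2 + 2, which is 4.",
        "content": "The answer is 4.",
    },
    "finish_reason": "stop",
}
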
@@ -565,7 +604,14 @@ class ChatModelMixin:
          return result

      @classmethod
-     def _tool_calls_completion_chunk(cls, model_family, model_uid, c, chunk_id=None):
+     def _post_process_completion_chunk(
+         cls,
+         model_family,
+         model_uid,
+         c,
+         chunk_id=None,
+         reasoning_parser: Optional[ReasoningParser] = None,
+     ):
          _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
          tool_result = cls._eval_tool_arguments(model_family, c)
          tool_calls = []
@@ -585,11 +631,22 @@ class ChatModelMixin:
              else:
                  failed_contents.append(content)
          finish_reason = "tool_calls" if tool_calls else "stop"
+
+         reasoning_content = None
+         content = ". ".join(failed_contents) if failed_contents else None
+         if reasoning_parser is not None:
+             reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
+                 content
+             )
          d = {
              "role": "assistant",
-             "content": ". ".join(failed_contents) if failed_contents else None,
+             "content": content,
              "tool_calls": tool_calls,
          }
+         # add only reasoning_content is None
+         if reasoning_content is not None:
+             d["reasoning_content"] = reasoning_content
+
          try:
              usage = c.get("usage")
              assert "prompt_tokens" in usage
@@ -616,7 +673,13 @@ class ChatModelMixin:
          }

      @classmethod
-     def _tool_calls_completion(cls, model_family, model_uid, c):
+     def _post_process_completion(
+         cls,
+         model_family,
+         model_uid,
+         c,
+         reasoning_parser: Optional[ReasoningParser] = None,
+     ):
          _id = str(uuid.uuid4())
          tool_result = cls._eval_tool_arguments(model_family, c)

@@ -637,11 +700,22 @@ class ChatModelMixin:
              else:
                  failed_contents.append(content)
          finish_reason = "tool_calls" if tool_calls else "stop"
+
+         reasoning_content = None
+         content = ". ".join(failed_contents) if failed_contents else None
+         if reasoning_parser is not None:
+             reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
+                 content
+             )
          m = {
              "role": "assistant",
-             "content": ". ".join(failed_contents) if failed_contents else None,
+             "content": content,
              "tool_calls": tool_calls,
          }
+         # add only reasoning_content is None
+         if reasoning_content is not None:
+             m["reasoning_content"] = reasoning_content
+
          try:
              usage = c.get("usage")
              assert "prompt_tokens" in usage
@@ -43,8 +43,6 @@ from ....types import (
  )
  from .. import LLM, LLMFamilyV1, LLMSpecV1
  from ..llm_family import CustomLLMFamilyV1
- from ..reasoning_parsers import deepseek_r1_reasoning_parser # noqa: F401
- from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
  from ..utils import (
      DEEPSEEK_TOOL_CALL_FAMILY,
      QWEN_TOOL_CALL_FAMILY,
@@ -160,6 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
      VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
      VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
      VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
+     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
      VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
      VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")

@@ -196,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
  if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
      VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
      VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
+     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")

  if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
      VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -211,9 +211,10 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":

  if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
      VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+     VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")

  if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
-     VLLM_SUPPORTED_CHAT_MODELS.append("qwen-2.5-instruct-1m")
+     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")


  class VLLMModel(LLM):
@@ -243,7 +244,6 @@ class VLLMModel(LLM):
          self.lora_modules = peft_model
          self.lora_requests: List[LoRARequest] = []
          self._xavier_config = None
-         self.reasoning_parser = None

      def set_xavier_config(self, value: Optional[Dict]):
          self._xavier_config = value # type: ignore
@@ -274,14 +274,8 @@ class VLLMModel(LLM):
          self._model_config = self._sanitize_model_config(self._model_config)
          reasoning_content = self._model_config.pop("reasoning_content")

-         # Initialize reasoning parser if model has reasoning ability
-         if "reasoning" in self.model_family.model_ability and reasoning_content:
-             module_name = self.model_family.model_family or self.model_family.model_name
-             self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-             self.reasoning_parser = self.reasoning_parser(
-                 self.model_family.reasoning_start_tag,
-                 self.model_family.reasoning_end_tag,
-             )
+         self.prepare_parse_reasoning_content(reasoning_content)
+
          if self.lora_modules is None:
              self.lora_requests = []
          else:
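
The per-engine setup removed here (and the matching calls added for SGLang and Transformers earlier in this diff) is replaced by a shared prepare_parse_reasoning_content helper that is not itself shown in this diff. A hedged sketch of what that helper plausibly does, reconstructed from the removed block above and the new ReasoningParser constructor; the packaged helper may differ in detail:

def prepare_parse_reasoning_content(self, reasoning_content: bool) -> None:
    # Sketch only: build a parser when the model family declares the
    # "reasoning" ability and the caller asked for reasoning_content.
    self.reasoning_parser = None
    if reasoning_content and "reasoning" in self.model_family.model_ability:
        self.reasoning_parser = ReasoningParser(
            self.model_family.reasoning_start_tag,
            self.model_family.reasoning_end_tag,
        )
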
@@ -581,6 +575,10 @@ class VLLMModel(LLM):
              raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

          sanitized_generate_config = self._sanitize_generate_config(generate_config)
+         if self.reasoning_parser:
+             # For reasoning model, the </think> we be split into multiple words,
+             # if `stop` param is passed, so we pop it from config.
+             sanitized_generate_config.pop("stop")
          logger.debug(
              "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
          )
@@ -812,18 +810,23 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
          i = 0
          async for chunk in chunks:
              if i == 0:
-                 yield self._get_first_chat_completion_chunk(chunk)
+                 yield self._get_first_chat_completion_chunk(
+                     chunk, self.reasoning_parser
+                 )
              # usage
              choices = chunk.get("choices")
              if not choices:
                  yield self._get_final_chat_completion_chunk(chunk)
              else:
                  if self.is_tool_call_chunk(chunk):
-                     yield self._tool_calls_completion_chunk(
-                         self.model_family, self.model_uid, chunk
+                     yield self._post_process_completion_chunk(
+                         self.model_family,
+                         self.model_uid,
+                         chunk,
+                         reasoning_parser=self.reasoning_parser,
                      )
                  else:
-                     yield self._to_chat_completion_chunk(chunk)
+                     yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
              i += 1

      @vllm_check
@@ -863,7 +866,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
          )
          assert not isinstance(c, AsyncGenerator)
          if tools:
-             return self._tool_calls_completion(self.model_family, self.model_uid, c)
+             return self._post_process_completion(
+                 self.model_family, self.model_uid, c, self.reasoning_parser
+             )
          return self._to_chat_completion(c, self.reasoning_parser)


@@ -905,31 +910,15 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
      def _sanitize_model_config(
          self, model_config: Optional[VLLMModelConfig]
      ) -> VLLMModelConfig:
-         if model_config is None:
-             model_config = VLLMModelConfig()
-
-         cuda_count = self._get_cuda_count()
-
-         model_config.setdefault("tokenizer_mode", "auto")
-         model_config.setdefault("trust_remote_code", True)
-         model_config.setdefault("tensor_parallel_size", cuda_count)
-         model_config.setdefault("block_size", 16)
-         model_config.setdefault("swap_space", 4)
-         model_config.setdefault("gpu_memory_utilization", 0.90)
-         model_config.setdefault("max_num_seqs", 256)
-         model_config.setdefault("quantization", None)
-         model_config.setdefault("max_model_len", None)
-         model_config["limit_mm_per_prompt"] = (
-             json.loads(model_config.get("limit_mm_per_prompt")) # type: ignore
-             if model_config.get("limit_mm_per_prompt")
-             else {
-                 "image": 2, # default 2 images all chat
-             }
-         )
-         # Add scheduling policy if vLLM version is 0.6.3 or higher
-         if vllm.__version__ >= "0.6.3":
-             model_config.setdefault("scheduling_policy", "fcfs")
-
+         model_config = super()._sanitize_model_config(model_config)
+         if vllm.__version__ >= "0.5.5":
+             model_config["limit_mm_per_prompt"] = (
+                 json.loads(model_config.get("limit_mm_per_prompt")) # type: ignore
+                 if model_config.get("limit_mm_per_prompt")
+                 else {
+                     "image": 2, # default 2 images all chat
+                 }
+             )
          return model_config

      def _sanitize_chat_config(