xinference 1.3.0.post1__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (52)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +1 -0
  3. xinference/conftest.py +7 -0
  4. xinference/core/model.py +3 -1
  5. xinference/core/scheduler.py +3 -0
  6. xinference/core/worker.py +1 -1
  7. xinference/deploy/cmdline.py +0 -8
  8. xinference/model/embedding/core.py +12 -5
  9. xinference/model/llm/__init__.py +2 -1
  10. xinference/model/llm/core.py +13 -0
  11. xinference/model/llm/llama_cpp/core.py +260 -3
  12. xinference/model/llm/llm_family.json +306 -17
  13. xinference/model/llm/llm_family_modelscope.json +347 -28
  14. xinference/model/llm/mlx/core.py +15 -4
  15. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +1 -1
  16. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +4 -5
  17. xinference/model/llm/sglang/core.py +7 -2
  18. xinference/model/llm/transformers/chatglm.py +4 -4
  19. xinference/model/llm/transformers/core.py +22 -5
  20. xinference/model/llm/transformers/intern_vl.py +2 -1
  21. xinference/model/llm/transformers/utils.py +1 -1
  22. xinference/model/llm/utils.py +103 -67
  23. xinference/model/llm/vllm/core.py +29 -42
  24. xinference/types.py +4 -0
  25. xinference/web/ui/build/asset-manifest.json +3 -3
  26. xinference/web/ui/build/index.html +1 -1
  27. xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
  28. xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
  36. xinference/web/ui/src/locales/en.json +9 -1
  37. xinference/web/ui/src/locales/zh.json +9 -1
  38. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/METADATA +7 -3
  39. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/RECORD +44 -43
  40. xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
  41. xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
  42. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
  43. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
  44. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
  45. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
  46. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
  47. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
  48. /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
  49. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/LICENSE +0 -0
  50. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/WHEEL +0 -0
  51. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/entry_points.txt +0 -0
  52. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/sglang/core.py CHANGED
@@ -48,6 +48,7 @@ class SGLANGModelConfig(TypedDict, total=False):
     nnodes: Optional[int]
     node_rank: Optional[int]
     dist_init_addr: Optional[str]
+    reasoning_content: bool


 class SGLANGGenerateConfig(TypedDict, total=False):
@@ -99,6 +100,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "QwQ-32B",
     "deepseek-r1-distill-qwen",
     "deepseek-r1-distill-llama",
     "deepseek-v3",
@@ -143,6 +145,8 @@ class SGLANGModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         self._model_config = self._sanitize_model_config(self._model_config)
+        reasoning_content = self._model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)

         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -255,6 +259,7 @@ class SGLANGModel(LLM):
         else:
             model_config["mem_fraction_static"] = 0.88
         model_config.setdefault("log_level", "info")
+        model_config.setdefault("reasoning_content", False)

         return model_config

@@ -547,8 +552,8 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if stream:
            agen = await self.async_generate(full_prompt, generate_config)  # type: ignore
            assert isinstance(agen, AsyncGenerator)
-            return self._async_to_chat_completion_chunks(agen)
+            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
            c = await self.async_generate(full_prompt, generate_config)  # type: ignore
            assert not isinstance(c, AsyncGenerator)
-            return self._to_chat_completion(c)
+            return self._to_chat_completion(c, self.reasoning_parser)
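Note: the SGLang hunks above add a `reasoning_content` flag to the engine's model config and route chat results through a reasoning parser. A minimal usage sketch follows, assuming the flag is forwarded as an extra launch kwarg the way other engine config options are; the endpoint URL and model name are placeholders, not taken from this diff.

```python
# Hypothetical sketch, not from the diff: enable reasoning_content at launch
# so chat responses can expose a separate "reasoning_content" field.
from xinference.client import Client

client = Client("http://localhost:9997")            # assumed local endpoint
model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",          # any model with reasoning ability
    model_engine="sglang",
    reasoning_content=True,                         # assumed to land in SGLANGModelConfig
)
model = client.get_model(model_uid)
result = model.chat(messages=[{"role": "user", "content": "Why is the sky blue?"}])
message = result["choices"][0]["message"]
print(message.get("reasoning_content"))             # chain-of-thought, if parsed
print(message.get("content"))                       # final answer
```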
xinference/model/llm/transformers/chatglm.py CHANGED
@@ -383,7 +383,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
            function_call = self._process_response_non_streaming(
                response, tools, use_tool=True
            )
-            return self._tool_calls_completion(
+            return self._post_process_completion(
                self.model_family, self.model_uid, function_call
            )
         else:
@@ -397,7 +397,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         prompt_tokens = len(inputs["input_ids"][0])
         for chunk_text in self._stream_chat(inputs, tools, **kwargs):
            if tools and isinstance(chunk_text, dict):
-                yield self._tool_calls_completion_chunk(
+                yield self._post_process_completion_chunk(
                    self.model_family, self.model_uid, chunk_text
                )
                return
@@ -484,7 +484,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
            function_call = self._process_response_non_streaming(
                response, req.tools, use_tool=True
            )
-            req.completion[0] = self._tool_calls_completion(
+            req.completion[0] = self._post_process_completion(
                self.model_family, self.model_uid, function_call
            )
            req.completion[0]["usage"] = usage
@@ -516,7 +516,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                c for c in req.completion if not isinstance(c, str)
            ][0]["id"]
            results.append(
-                self._tool_calls_completion_chunk(
+                self._post_process_completion_chunk(
                    self.model_family,
                    self.model_uid,
                    new_response,
xinference/model/llm/transformers/core.py CHANGED
@@ -61,6 +61,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "deepseek-vl-chat",
     "internvl-chat",
     "internvl2",
+    "Internvl2.5",
+    "Internvl2.5-MPO",
     "cogvlm2",
     "cogvlm2-video-llama3-chat",
     "MiniCPM-Llama3-V-2_5",
@@ -112,6 +114,7 @@ class PytorchModel(LLM):
         pytorch_model_config.setdefault("trust_remote_code", True)
         pytorch_model_config.setdefault("max_num_seqs", 16)
         pytorch_model_config.setdefault("enable_tensorizer", False)
+        pytorch_model_config.setdefault("reasoning_content", False)
         return pytorch_model_config

     def _sanitize_generate_config(
@@ -324,6 +327,9 @@ class PytorchModel(LLM):
            kwargs.update({"device_map": "auto"})
            is_device_map_auto = True

+        reasoning_content = self._pytorch_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if self._check_tensorizer_integrity():
            self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
@@ -714,23 +720,34 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):

     def handle_chat_result_non_streaming(self, req: InferenceRequest):
         if req.tools:
-            req.completion[0] = self._tool_calls_completion(
-                self.model_family, self.model_uid, req.completion[0]
+            req.completion[0] = self._post_process_completion(
+                self.model_family,
+                self.model_uid,
+                req.completion[0],
+                self.reasoning_parser,
            )
         else:
-            req.completion[0] = self._to_chat_completion(req.completion[0])
+            req.completion[0] = self._to_chat_completion(
+                req.completion[0], self.reasoning_parser
+            )

     def handle_chat_result_streaming(self, req: InferenceRequest):
         results = []
         for i, c in enumerate(req.completion):
            if c == "<bos_stream>":
                results.append(
-                    self._get_first_chat_completion_chunk(req.completion[i + 1])
+                    self._get_first_chat_completion_chunk(
+                        req.completion[i + 1], self.reasoning_parser
+                    )
                )
            elif c == "<eos_stream>":
                break
            else:
-                results.append(self._to_chat_completion_chunk(c))
+                results.append(
+                    self._to_chat_completion_chunk(
+                        c, self.reasoning_parser, req.previous_texts
+                    )
+                )

         if req.stopped and req.include_usage:
            results.append(self._get_final_chat_completion_chunk(req.completion[-1]))
xinference/model/llm/transformers/intern_vl.py CHANGED
@@ -265,7 +265,8 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
            return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
-        model_name = f"{self.model_family.model_name.lower()}-{model_size}"
+        model_name = self.model_family.model_name.lower().replace("-mpo", "")
+        model_name = f"{model_name}-{model_size}"
         num_layers = {
            "internvl2-1B": 24,
            "internvl2-2B": 24,
xinference/model/llm/transformers/utils.py CHANGED
@@ -132,7 +132,7 @@ def _pad_seqs_inplace(seqs: List[List[int]], reqs: List[InferenceRequest], pad:

 def get_max_src_len(context_len: int, r: InferenceRequest) -> int:
     max_new_tokens = int(
-        r.sanitized_generate_config.get("max_tokens", max_tokens_field.default)
+        r.sanitized_generate_config.get("max_tokens") or max_tokens_field.default
     )
     return context_len - max_new_tokens - 8

xinference/model/llm/utils.py CHANGED
@@ -41,6 +41,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionChunkDelta,
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
@@ -243,62 +244,73 @@ class ChatModelMixin:
            raise ValueError(f"Invalid model family: {model_family}")

     @classmethod
-    def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk:
-        choices = chunk.get("choices")
-        if (
-            chunk.get("object") == "chat.completion.chunk"
-            and choices
-            and "delta" in choices[0]
-        ):
-            # Already a ChatCompletionChunk, we don't need to convert chunk.
-            return cast(ChatCompletionChunk, chunk)
+    def _to_chat_completion_chunk(
+        cls,
+        chunk: CompletionChunk,
+        reasoning_parser: Optional[ReasoningParser] = None,
+        previous_texts: Optional[List[str]] = None,
+    ) -> ChatCompletionChunk:
+        choices_list = []
+        for i, choice in enumerate(chunk["choices"]):
+            delta = ChatCompletionChunkDelta()
+            if "text" in choice and choice["finish_reason"] is None:
+                if reasoning_parser is None:
+                    delta["content"] = choice["text"]
+                else:
+                    assert previous_texts is not None
+                    current_text = previous_texts[-1] + choice["text"]
+                    delta = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_texts[-1],
+                        current_text=current_text,
+                        delta_text=choice["text"],
+                    )
+                    previous_texts[-1] = current_text
+            if "tool_calls" in choice:
+                delta["tool_calls"] = choice["tool_calls"]
+            choices_list.append(
+                {
+                    "index": i,
+                    "delta": delta,
+                    "finish_reason": choice["finish_reason"],
+                }
+            )
         chat_chunk = {
            "id": "chat" + chunk["id"],
            "model": chunk["model"],
            "created": chunk["created"],
            "object": "chat.completion.chunk",
-            "choices": [
-                {
-                    "index": i,
-                    "delta": {
-                        **(
-                            {"content": choice["text"]}
-                            if ("text" in choice and choice["finish_reason"] is None)
-                            else {}
-                        ),
-                        **(
-                            {"tool_calls": choice["tool_calls"]}
-                            if "tool_calls" in choice
-                            else {}
-                        ),
-                    },
-                    "finish_reason": choice["finish_reason"],
-                }
-                for i, choice in enumerate(chunk["choices"])
-            ],
+            "choices": choices_list,
         }
         return cast(ChatCompletionChunk, chat_chunk)

     @classmethod
     def _get_first_chat_completion_chunk(
-        cls, chunk: CompletionChunk
+        cls,
+        chunk: CompletionChunk,
+        reasoning_parser: Optional[ReasoningParser] = None,
     ) -> ChatCompletionChunk:
+        choices_list = []
+        for i, choice in enumerate(chunk["choices"]):
+            delta = {
+                "role": "assistant",
+            }
+            if reasoning_parser is None:
+                delta["content"] = ""
+            else:
+                delta["reasoning_content"] = ""
+            choices_list.append(
+                {
+                    "index": i,
+                    "delta": delta,
+                    "finish_reason": None,
+                }
+            )
         chat_chunk = {
            "id": "chat" + chunk["id"],
            "model": chunk["model"],
            "created": chunk["created"],
            "object": "chat.completion.chunk",
-            "choices": [
-                {
-                    "index": i,
-                    "delta": {
-                        "role": "assistant",
-                        "content": "",
-                    },
-                    "finish_reason": None,
-                }
-                for i, choice in enumerate(chunk["choices"])
-            ],
+            "choices": choices_list,
         }
         return cast(ChatCompletionChunk, chat_chunk)

@@ -324,15 +336,18 @@
         chunks: Iterator[CompletionChunk],
         reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
+        previous_texts = [""]
         for i, chunk in enumerate(chunks):
            if i == 0:
-                yield cls._get_first_chat_completion_chunk(chunk)
+                yield cls._get_first_chat_completion_chunk(chunk, reasoning_parse)
            # usage
            choices = chunk.get("choices")
            if not choices:
                yield cls._get_final_chat_completion_chunk(chunk)
            else:
-                yield cls._to_chat_completion_chunk(chunk)
+                yield cls._to_chat_completion_chunk(
+                    chunk, reasoning_parse, previous_texts
+                )

     @classmethod
     def _tools_to_messages_for_deepseek(
@@ -370,33 +385,19 @@
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
-        previous_text = ""
-        current_text = ""
+        previous_texts = [""]
         async for chunk in chunks:
            if i == 0:
-                chat_chunk = cls._get_first_chat_completion_chunk(chunk)
+                chat_chunk = cls._get_first_chat_completion_chunk(
+                    chunk, reasoning_parser
+                )
            elif not chunk.get("choices"):
                # usage
                chat_chunk = cls._get_final_chat_completion_chunk(chunk)
            else:
-                chat_chunk = cls._to_chat_completion_chunk(chunk)
-                if reasoning_parser is not None:
-                    choices = chat_chunk.get("choices")
-                    if choices is None:
-                        continue
-                    for choice in choices:
-                        delta = choice.get("delta")
-                        if not delta:
-                            continue
-                        current_text = previous_text + delta.get("content", "")
-                        choice[
-                            "delta"
-                        ] = reasoning_parser.extract_reasoning_content_streaming(
-                            previous_text=previous_text,
-                            current_text=current_text,
-                            delta=delta,
-                        )
-                        previous_text = current_text
+                chat_chunk = cls._to_chat_completion_chunk(
+                    chunk, reasoning_parser, previous_texts
+                )
            yield chat_chunk
            i += 1

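With the conversion helpers above, streamed chunks carry the parsed reasoning text in delta["reasoning_content"] and the final answer in delta["content"]. A self-contained consumer-side sketch; the chunks below are fabricated to mirror the delta shapes produced by _get_first_chat_completion_chunk and _to_chat_completion_chunk, not captured from a real model:

```python
# Fabricated chunks that mimic the converted ChatCompletionChunk deltas.
fake_stream = [
    {"choices": [{"index": 0, "delta": {"role": "assistant", "reasoning_content": ""}, "finish_reason": None}]},
    {"choices": [{"index": 0, "delta": {"reasoning_content": "User asks why the sky is blue..."}, "finish_reason": None}]},
    {"choices": [{"index": 0, "delta": {"content": "Rayleigh scattering."}, "finish_reason": "stop"}]},
]

reasoning_parts, answer_parts = [], []
for chunk in fake_stream:
    for choice in chunk.get("choices", []):
        delta = choice.get("delta", {})
        if delta.get("reasoning_content"):
            reasoning_parts.append(delta["reasoning_content"])
        if delta.get("content"):
            answer_parts.append(delta["content"])

print("reasoning:", "".join(reasoning_parts))
print("answer:", "".join(answer_parts))
```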
@@ -565,7 +566,14 @@
         return result

     @classmethod
-    def _tool_calls_completion_chunk(cls, model_family, model_uid, c, chunk_id=None):
+    def _post_process_completion_chunk(
+        cls,
+        model_family,
+        model_uid,
+        c,
+        chunk_id=None,
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
         tool_calls = []
@@ -585,11 +593,22 @@
            else:
                failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
+
+        reasoning_content = None
+        content = ". ".join(failed_contents) if failed_contents else None
+        if reasoning_parser is not None:
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                content
+            )
         d = {
            "role": "assistant",
-            "content": ". ".join(failed_contents) if failed_contents else None,
+            "content": content,
            "tool_calls": tool_calls,
         }
+        # add only reasoning_content is None
+        if reasoning_content is not None:
+            d["reasoning_content"] = reasoning_content
+
         try:
            usage = c.get("usage")
            assert "prompt_tokens" in usage
@@ -616,7 +635,13 @@
         }

     @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c):
+    def _post_process_completion(
+        cls,
+        model_family,
+        model_uid,
+        c,
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ):
         _id = str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)

@@ -637,11 +662,22 @@
            else:
                failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
+
+        reasoning_content = None
+        content = ". ".join(failed_contents) if failed_contents else None
+        if reasoning_parser is not None:
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                content
+            )
         m = {
            "role": "assistant",
-            "content": ". ".join(failed_contents) if failed_contents else None,
+            "content": content,
            "tool_calls": tool_calls,
         }
+        # add only reasoning_content is None
+        if reasoning_content is not None:
+            m["reasoning_content"] = reasoning_content
+
         try:
            usage = c.get("usage")
            assert "prompt_tokens" in usage
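Both _post_process_completion helpers now pass the non-tool-call content through reasoning_parser.extract_reasoning_content before building the assistant message. A rough sketch of that kind of split for DeepSeek-R1-style think tags; this regex version is an illustration only, not the actual ReasoningParser implementation, and the tag strings are assumed defaults:

```python
import re

def split_reasoning(text, start_tag="<think>", end_tag="</think>"):
    # Return (reasoning, content); reasoning is None when no tags are found.
    pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
    match = re.search(pattern, text, flags=re.DOTALL)
    if not match:
        return None, text
    reasoning = match.group(1).strip()
    content = (text[: match.start()] + text[match.end():]).strip()
    return reasoning, content or None

print(split_reasoning("<think>2 + 2 = 4</think>The answer is 4."))
# -> ('2 + 2 = 4', 'The answer is 4.')
```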
xinference/model/llm/vllm/core.py CHANGED
@@ -43,8 +43,6 @@ from ....types import (
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
-from ..reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
-from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -160,6 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
+    VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")

@@ -196,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")

 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -211,9 +211,10 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":

 if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")

 if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
-    VLLM_SUPPORTED_CHAT_MODELS.append("qwen-2.5-instruct-1m")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")


 class VLLMModel(LLM):
@@ -243,7 +244,6 @@ class VLLMModel(LLM):
         self.lora_modules = peft_model
         self.lora_requests: List[LoRARequest] = []
         self._xavier_config = None
-        self.reasoning_parser = None

     def set_xavier_config(self, value: Optional[Dict]):
         self._xavier_config = value  # type: ignore
@@ -274,14 +274,8 @@
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")

-        # Initialize reasoning parser if model has reasoning ability
-        if "reasoning" in self.model_family.model_ability and reasoning_content:
-            module_name = self.model_family.model_family or self.model_family.model_name
-            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-            self.reasoning_parser = self.reasoning_parser(
-                self.model_family.reasoning_start_tag,
-                self.model_family.reasoning_end_tag,
-            )
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if self.lora_modules is None:
            self.lora_requests = []
         else:
@@ -581,6 +575,8 @@
            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         sanitized_generate_config = self._sanitize_generate_config(generate_config)
+        if self.reasoning_parser:
+            sanitized_generate_config.pop("stop")
         logger.debug(
            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
@@ -812,18 +808,23 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         i = 0
         async for chunk in chunks:
            if i == 0:
-                yield self._get_first_chat_completion_chunk(chunk)
+                yield self._get_first_chat_completion_chunk(
+                    chunk, self.reasoning_parser
+                )
            # usage
            choices = chunk.get("choices")
            if not choices:
                yield self._get_final_chat_completion_chunk(chunk)
            else:
                if self.is_tool_call_chunk(chunk):
-                    yield self._tool_calls_completion_chunk(
-                        self.model_family, self.model_uid, chunk
+                    yield self._post_process_completion_chunk(
+                        self.model_family,
+                        self.model_uid,
+                        chunk,
+                        reasoning_parser=self.reasoning_parser,
                    )
                else:
-                    yield self._to_chat_completion_chunk(chunk)
+                    yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
            i += 1

     @vllm_check
@@ -863,7 +864,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         )
         assert not isinstance(c, AsyncGenerator)
         if tools:
-            return self._tool_calls_completion(self.model_family, self.model_uid, c)
+            return self._post_process_completion(
+                self.model_family, self.model_uid, c, self.reasoning_parser
+            )
         return self._to_chat_completion(c, self.reasoning_parser)


@@ -905,31 +908,15 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     def _sanitize_model_config(
         self, model_config: Optional[VLLMModelConfig]
     ) -> VLLMModelConfig:
-        if model_config is None:
-            model_config = VLLMModelConfig()
-
-        cuda_count = self._get_cuda_count()
-
-        model_config.setdefault("tokenizer_mode", "auto")
-        model_config.setdefault("trust_remote_code", True)
-        model_config.setdefault("tensor_parallel_size", cuda_count)
-        model_config.setdefault("block_size", 16)
-        model_config.setdefault("swap_space", 4)
-        model_config.setdefault("gpu_memory_utilization", 0.90)
-        model_config.setdefault("max_num_seqs", 256)
-        model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len", None)
-        model_config["limit_mm_per_prompt"] = (
-            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
-            if model_config.get("limit_mm_per_prompt")
-            else {
-                "image": 2,  # default 2 images all chat
-            }
-        )
-        # Add scheduling policy if vLLM version is 0.6.3 or higher
-        if vllm.__version__ >= "0.6.3":
-            model_config.setdefault("scheduling_policy", "fcfs")
-
+        model_config = super()._sanitize_model_config(model_config)
+        if vllm.__version__ >= "0.5.5":
+            model_config["limit_mm_per_prompt"] = (
+                json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+                if model_config.get("limit_mm_per_prompt")
+                else {
+                    "image": 2,  # default 2 images all chat
+                }
+            )
         return model_config

     def _sanitize_chat_config(
xinference/types.py CHANGED
@@ -78,6 +78,7 @@ class EmbeddingData(TypedDict):
 class Embedding(TypedDict):
     object: Literal["list"]
     model: str
+    model_replica: str
     data: List[EmbeddingData]
     usage: EmbeddingUsage

@@ -276,6 +277,7 @@ class LlamaCppModelConfig(TypedDict, total=False):
     use_mmap: bool
     use_mlock: bool
     n_threads: Optional[int]
+    n_parallel: Optional[int]
     n_batch: int
     last_n_tokens_size: int
     lora_base: Optional[str]
@@ -284,6 +286,7 @@
     n_gqa: Optional[int]  # (TEMPORARY) must be 8 for llama2 70b
     rms_norm_eps: Optional[float]  # (TEMPORARY)
     verbose: bool
+    reasoning_content: bool


 class PytorchGenerateConfig(TypedDict, total=False):
@@ -330,6 +333,7 @@ class PytorchModelConfig(TypedDict, total=False):
     trust_remote_code: bool
     max_num_seqs: int
     enable_tensorizer: Optional[bool]
+    reasoning_content: bool


 def get_pydantic_model_from_method(
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.f8177338.css",
-    "main.js": "./static/js/main.ad42919c.js",
+    "main.js": "./static/js/main.55b70cb7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.f8177338.css.map": "./static/css/main.f8177338.css.map",
-    "main.ad42919c.js.map": "./static/js/main.ad42919c.js.map"
+    "main.55b70cb7.js.map": "./static/js/main.55b70cb7.js.map"
   },
   "entrypoints": [
     "static/css/main.f8177338.css",
-    "static/js/main.ad42919c.js"
+    "static/js/main.55b70cb7.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.ad42919c.js"></script><link href="./static/css/main.f8177338.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.55b70cb7.js"></script><link href="./static/css/main.f8177338.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>