xinference 1.3.0.post2__py3-none-any.whl → 1.3.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (53)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +1 -0
  3. xinference/conftest.py +7 -0
  4. xinference/core/chat_interface.py +39 -24
  5. xinference/core/model.py +3 -1
  6. xinference/core/scheduler.py +3 -0
  7. xinference/core/worker.py +1 -1
  8. xinference/model/embedding/core.py +12 -5
  9. xinference/model/llm/__init__.py +2 -1
  10. xinference/model/llm/core.py +10 -0
  11. xinference/model/llm/llama_cpp/core.py +266 -3
  12. xinference/model/llm/llm_family.json +390 -17
  13. xinference/model/llm/llm_family_modelscope.json +348 -29
  14. xinference/model/llm/mlx/core.py +15 -4
  15. xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +9 -13
  16. xinference/model/llm/sglang/core.py +7 -2
  17. xinference/model/llm/transformers/chatglm.py +4 -4
  18. xinference/model/llm/transformers/core.py +22 -5
  19. xinference/model/llm/transformers/intern_vl.py +2 -1
  20. xinference/model/llm/transformers/utils.py +1 -1
  21. xinference/model/llm/utils.py +134 -60
  22. xinference/model/llm/vllm/core.py +31 -42
  23. xinference/types.py +4 -0
  24. xinference/web/ui/build/asset-manifest.json +3 -3
  25. xinference/web/ui/build/index.html +1 -1
  26. xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
  27. xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
  35. xinference/web/ui/src/locales/en.json +9 -1
  36. xinference/web/ui/src/locales/zh.json +9 -1
  37. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/METADATA +9 -5
  38. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/RECORD +43 -44
  39. xinference/model/llm/reasoning_parsers/__init__.py +0 -13
  40. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
  41. xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
  42. xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
  43. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
  44. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
  45. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
  46. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
  47. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
  48. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
  49. /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
  50. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/LICENSE +0 -0
  51. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/WHEEL +0 -0
  52. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/entry_points.txt +0 -0
  53. {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,17 @@
  import re
  from typing import Optional, Tuple, Union

- from ....types import ChatCompletionChunkDelta, CompletionChoice
- from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+ from ...types import ChatCompletionChunkDelta, CompletionChoice


- @ReasoningParserManager.register_module("deepseek-v3")
- @ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
- @ReasoningParserManager.register_module("deepseek-r1-distill-llama")
- class DeepSeekR1ReasoningParser(ReasoningParser):
-     """Reasoning parser for DeepSeek-R1 model."""
+ class ReasoningParser:
+     """Reasoning parser for reasoning model."""

      def __init__(
          self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
      ):
-         super().__init__(reasoning_start_tag, reasoning_end_tag)
+         self.reasoning_start_tag = reasoning_start_tag
+         self.reasoning_end_tag = reasoning_end_tag
          self.reasoning_regex = re.compile(
              rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
          )
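
The registry-based, DeepSeek-specific parser is collapsed here into a single generic ReasoningParser keyed only by the start/end tags. For orientation, a minimal standalone sketch of what the compiled regex splits out with the default tags; the splitting helper below only illustrates the extract_reasoning_content behaviour used elsewhere in this diff, it is not the packaged implementation:

import re

start_tag, end_tag = "<think>", "</think>"
reasoning_regex = re.compile(rf"{start_tag}(.*?){end_tag}", re.DOTALL)

text = "<think>The user wants 2 + 2, which is 4.</think>The answer is 4."
match = reasoning_regex.search(text)
if match:
    reasoning_content = match.group(1)     # "The user wants 2 + 2, which is 4."
    content = text[match.end():].lstrip()  # "The answer is 4."
else:
    reasoning_content, content = None, text
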
@@ -23,7 +20,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
          self,
          previous_text: str,
          current_text: str,
-         delta: ChatCompletionChunkDelta,
+         delta_text: str,
      ) -> ChatCompletionChunkDelta:
          """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.

@@ -34,10 +31,9 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
          Yields:
              str: Extracted reasoning content chunks.
          """
-         if delta is None:
-             return delta
-
-         delta_text = delta["content"]
+         delta = ChatCompletionChunkDelta(
+             content=delta_text,
+         )

          # Check if <think> is present in previous or delta.
          # Keep compatibility with models that don't generate <think> tokens.
@@ -48,6 +48,7 @@ class SGLANGModelConfig(TypedDict, total=False):
      nnodes: Optional[int]
      node_rank: Optional[int]
      dist_init_addr: Optional[str]
+     reasoning_content: bool


  class SGLANGGenerateConfig(TypedDict, total=False):
@@ -99,6 +100,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
      "qwen2.5-instruct",
      "qwen2.5-coder-instruct",
      "QwQ-32B-Preview",
+     "QwQ-32B",
      "deepseek-r1-distill-qwen",
      "deepseek-r1-distill-llama",
      "deepseek-v3",
@@ -143,6 +145,8 @@ class SGLANGModel(LLM):
              raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

          self._model_config = self._sanitize_model_config(self._model_config)
+         reasoning_content = self._model_config.pop("reasoning_content")
+         self.prepare_parse_reasoning_content(reasoning_content)

          # Fix: GH#2169
          if sgl.__version__ >= "0.2.14":
@@ -255,6 +259,7 @@ class SGLANGModel(LLM):
          else:
              model_config["mem_fraction_static"] = 0.88
          model_config.setdefault("log_level", "info")
+         model_config.setdefault("reasoning_content", False)

          return model_config

@@ -547,8 +552,8 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
          if stream:
              agen = await self.async_generate(full_prompt, generate_config) # type: ignore
              assert isinstance(agen, AsyncGenerator)
-             return self._async_to_chat_completion_chunks(agen)
+             return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
          else:
              c = await self.async_generate(full_prompt, generate_config) # type: ignore
              assert not isinstance(c, AsyncGenerator)
-             return self._to_chat_completion(c)
+             return self._to_chat_completion(c, self.reasoning_parser)
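
With reasoning_content now a recognised SGLang model-config key (defaulting to False above), it can be switched on when launching a reasoning model. A hedged usage sketch, assuming extra launch kwargs are forwarded into the engine's model config as with other engine options; the endpoint and model choice are placeholders:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # placeholder endpoint

model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",
    model_engine="sglang",
    model_size_in_billions=7,
    # Ask the engine to split <think>...</think> output into a separate
    # reasoning_content field instead of leaving it inline in content.
    reasoning_content=True,
)
model = client.get_model(model_uid)
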
@@ -383,7 +383,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
              function_call = self._process_response_non_streaming(
                  response, tools, use_tool=True
              )
-             return self._tool_calls_completion(
+             return self._post_process_completion(
                  self.model_family, self.model_uid, function_call
              )
          else:
@@ -397,7 +397,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
          prompt_tokens = len(inputs["input_ids"][0])
          for chunk_text in self._stream_chat(inputs, tools, **kwargs):
              if tools and isinstance(chunk_text, dict):
-                 yield self._tool_calls_completion_chunk(
+                 yield self._post_process_completion_chunk(
                      self.model_family, self.model_uid, chunk_text
                  )
                  return
@@ -484,7 +484,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
              function_call = self._process_response_non_streaming(
                  response, req.tools, use_tool=True
              )
-             req.completion[0] = self._tool_calls_completion(
+             req.completion[0] = self._post_process_completion(
                  self.model_family, self.model_uid, function_call
              )
              req.completion[0]["usage"] = usage
@@ -516,7 +516,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                  c for c in req.completion if not isinstance(c, str)
              ][0]["id"]
              results.append(
-                 self._tool_calls_completion_chunk(
+                 self._post_process_completion_chunk(
                      self.model_family,
                      self.model_uid,
                      new_response,
@@ -61,6 +61,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
      "deepseek-vl-chat",
      "internvl-chat",
      "internvl2",
+     "Internvl2.5",
+     "Internvl2.5-MPO",
      "cogvlm2",
      "cogvlm2-video-llama3-chat",
      "MiniCPM-Llama3-V-2_5",
@@ -112,6 +114,7 @@ class PytorchModel(LLM):
          pytorch_model_config.setdefault("trust_remote_code", True)
          pytorch_model_config.setdefault("max_num_seqs", 16)
          pytorch_model_config.setdefault("enable_tensorizer", False)
+         pytorch_model_config.setdefault("reasoning_content", False)
          return pytorch_model_config

      def _sanitize_generate_config(
@@ -324,6 +327,9 @@ class PytorchModel(LLM):
              kwargs.update({"device_map": "auto"})
              is_device_map_auto = True

+         reasoning_content = self._pytorch_model_config.pop("reasoning_content")
+         self.prepare_parse_reasoning_content(reasoning_content)
+
          if self._check_tensorizer_integrity():
              self._model, self._tokenizer = self._load_tensorizer(**kwargs)
          else:
@@ -714,23 +720,34 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):

      def handle_chat_result_non_streaming(self, req: InferenceRequest):
          if req.tools:
-             req.completion[0] = self._tool_calls_completion(
-                 self.model_family, self.model_uid, req.completion[0]
+             req.completion[0] = self._post_process_completion(
+                 self.model_family,
+                 self.model_uid,
+                 req.completion[0],
+                 self.reasoning_parser,
              )
          else:
-             req.completion[0] = self._to_chat_completion(req.completion[0])
+             req.completion[0] = self._to_chat_completion(
+                 req.completion[0], self.reasoning_parser
+             )

      def handle_chat_result_streaming(self, req: InferenceRequest):
          results = []
          for i, c in enumerate(req.completion):
              if c == "<bos_stream>":
                  results.append(
-                     self._get_first_chat_completion_chunk(req.completion[i + 1])
+                     self._get_first_chat_completion_chunk(
+                         req.completion[i + 1], self.reasoning_parser
+                     )
                  )
              elif c == "<eos_stream>":
                  break
              else:
-                 results.append(self._to_chat_completion_chunk(c))
+                 results.append(
+                     self._to_chat_completion_chunk(
+                         c, self.reasoning_parser, req.previous_texts
+                     )
+                 )

          if req.stopped and req.include_usage:
              results.append(self._get_final_chat_completion_chunk(req.completion[-1]))
@@ -265,7 +265,8 @@ class InternVLChatModel(PytorchChatModel):
          if world_size == 1:
              return None
          model_size = f"{self.model_spec.model_size_in_billions}B"
-         model_name = f"{self.model_family.model_name.lower()}-{model_size}"
+         model_name = self.model_family.model_name.lower().replace("-mpo", "")
+         model_name = f"{model_name}-{model_size}"
          num_layers = {
              "internvl2-1B": 24,
              "internvl2-2B": 24,
@@ -132,7 +132,7 @@ def _pad_seqs_inplace(seqs: List[List[int]], reqs: List[InferenceRequest], pad:

  def get_max_src_len(context_len: int, r: InferenceRequest) -> int:
      max_new_tokens = int(
-         r.sanitized_generate_config.get("max_tokens", max_tokens_field.default)
+         r.sanitized_generate_config.get("max_tokens") or max_tokens_field.default
      )
      return context_len - max_new_tokens - 8

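
The small change above matters because dict.get(key, default) only falls back when the key is absent; a sanitized config that explicitly carries max_tokens=None would previously reach int(None) and raise. A standalone illustration (max_tokens_default stands in for max_tokens_field.default):

max_tokens_default = 256        # stand-in for max_tokens_field.default
config = {"max_tokens": None}   # key present, value None

old_value = config.get("max_tokens", max_tokens_default)    # None; int(None) raises
new_value = config.get("max_tokens") or max_tokens_default  # 256

Note that the or form also falls back on a literal 0, which is acceptable here since a budget of zero new tokens is never useful.
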
@@ -41,6 +41,7 @@ from ...types import (
      ChatCompletion,
      ChatCompletionChoice,
      ChatCompletionChunk,
+     ChatCompletionChunkDelta,
      ChatCompletionMessage,
      Completion,
      CompletionChoice,
@@ -54,7 +55,7 @@ from .llm_family import (
      _get_cache_dir,
      get_cache_status,
  )
- from .reasoning_parsers.abs_reasoning_parsers import ReasoningParser
+ from .reasoning_parser import ReasoningParser

  logger = logging.getLogger(__name__)

@@ -243,62 +244,95 @@ class ChatModelMixin:
              raise ValueError(f"Invalid model family: {model_family}")

      @classmethod
-     def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk:
+     def _to_chat_completion_chunk(
+         cls,
+         chunk: CompletionChunk,
+         reasoning_parser: Optional[ReasoningParser] = None,
+         previous_texts: Optional[List[str]] = None,
+     ) -> ChatCompletionChunk:
          choices = chunk.get("choices")
          if (
              chunk.get("object") == "chat.completion.chunk"
              and choices
              and "delta" in choices[0]
          ):
+             if reasoning_parser is not None:
+                 # process parsing reasoning content
+                 assert previous_texts is not None
+                 delta = choices[0]["delta"] # type: ignore
+                 if text := delta.get("content"):
+                     current_text = previous_texts[-1] + text
+                     delta = reasoning_parser.extract_reasoning_content_streaming(
+                         previous_text=previous_texts[-1],
+                         current_text=current_text,
+                         delta_text=text,
+                     )
+                     previous_texts[-1] = current_text
+                     choices[0]["delta"] = delta # type: ignore
              # Already a ChatCompletionChunk, we don't need to convert chunk.
              return cast(ChatCompletionChunk, chunk)
+
+         choices_list = []
+         for i, choice in enumerate(choices): # type: ignore
+             delta = ChatCompletionChunkDelta()
+             if "text" in choice and choice["finish_reason"] is None:
+                 if reasoning_parser is None:
+                     delta["content"] = choice["text"]
+                 else:
+                     assert previous_texts is not None
+                     current_text = previous_texts[-1] + choice["text"]
+                     delta = reasoning_parser.extract_reasoning_content_streaming(
+                         previous_text=previous_texts[-1],
+                         current_text=current_text,
+                         delta_text=choice["text"],
+                     )
+                     previous_texts[-1] = current_text
+             if "tool_calls" in choice:
+                 delta["tool_calls"] = choice["tool_calls"]
+             choices_list.append(
+                 {
+                     "index": i,
+                     "delta": delta,
+                     "finish_reason": choice["finish_reason"],
+                 }
+             )
          chat_chunk = {
              "id": "chat" + chunk["id"],
              "model": chunk["model"],
              "created": chunk["created"],
              "object": "chat.completion.chunk",
-             "choices": [
-                 {
-                     "index": i,
-                     "delta": {
-                         **(
-                             {"content": choice["text"]}
-                             if ("text" in choice and choice["finish_reason"] is None)
-                             else {}
-                         ),
-                         **(
-                             {"tool_calls": choice["tool_calls"]}
-                             if "tool_calls" in choice
-                             else {}
-                         ),
-                     },
-                     "finish_reason": choice["finish_reason"],
-                 }
-                 for i, choice in enumerate(chunk["choices"])
-             ],
+             "choices": choices_list,
          }
          return cast(ChatCompletionChunk, chat_chunk)

      @classmethod
      def _get_first_chat_completion_chunk(
-         cls, chunk: CompletionChunk
+         cls,
+         chunk: CompletionChunk,
+         reasoning_parser: Optional[ReasoningParser] = None,
      ) -> ChatCompletionChunk:
+         choices_list = []
+         for i, choice in enumerate(chunk["choices"]):
+             delta = {
+                 "role": "assistant",
+             }
+             if reasoning_parser is None:
+                 delta["content"] = ""
+             else:
+                 delta["reasoning_content"] = ""
+             choices_list.append(
+                 {
+                     "index": i,
+                     "delta": delta,
+                     "finish_reason": None,
+                 }
+             )
          chat_chunk = {
              "id": "chat" + chunk["id"],
              "model": chunk["model"],
              "created": chunk["created"],
              "object": "chat.completion.chunk",
-             "choices": [
-                 {
-                     "index": i,
-                     "delta": {
-                         "role": "assistant",
-                         "content": "",
-                     },
-                     "finish_reason": None,
-                 }
-                 for i, choice in enumerate(chunk["choices"])
-             ],
+             "choices": choices_list,
          }
          return cast(ChatCompletionChunk, chat_chunk)

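
Both converters now share streaming state through a one-element previous_texts list: the caller owns the list and _to_chat_completion_chunk overwrites its last slot, so the accumulated text survives across chunks without extra plumbing. A minimal standalone sketch of that hand-off, using a stub parser in place of the packaged class:

class StubReasoningParser:
    """Illustrative stand-in: route text seen before </think> to reasoning_content."""

    def extract_reasoning_content_streaming(self, previous_text, current_text, delta_text):
        if "</think>" in previous_text:
            return {"content": delta_text}
        cleaned = delta_text.replace("<think>", "").replace("</think>", "")
        return {"reasoning_content": cleaned}


parser = StubReasoningParser()
previous_texts = [""]  # shared accumulator, mutated in place

for delta_text in ["<think>add the ", "numbers</think>", "4"]:
    current_text = previous_texts[-1] + delta_text
    delta = parser.extract_reasoning_content_streaming(
        previous_text=previous_texts[-1],
        current_text=current_text,
        delta_text=delta_text,
    )
    previous_texts[-1] = current_text
    print(delta)
# -> {'reasoning_content': 'add the '}
# -> {'reasoning_content': 'numbers'}
# -> {'content': '4'}
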
@@ -324,15 +358,19 @@ class ChatModelMixin:
          chunks: Iterator[CompletionChunk],
          reasoning_parse: Optional[ReasoningParser] = None,
      ) -> Iterator[ChatCompletionChunk]:
+         previous_texts = [""]
          for i, chunk in enumerate(chunks):
              if i == 0:
-                 yield cls._get_first_chat_completion_chunk(chunk)
+                 yield cls._get_first_chat_completion_chunk(chunk, reasoning_parse)
              # usage
              choices = chunk.get("choices")
              if not choices:
                  yield cls._get_final_chat_completion_chunk(chunk)
              else:
-                 yield cls._to_chat_completion_chunk(chunk)
+                 r = cls._to_chat_completion_chunk(
+                     chunk, reasoning_parse, previous_texts
+                 )
+                 yield r

      @classmethod
      def _tools_to_messages_for_deepseek(
@@ -370,33 +408,19 @@ class ChatModelMixin:
          reasoning_parser: Optional[ReasoningParser] = None,
      ) -> AsyncGenerator[ChatCompletionChunk, None]:
          i = 0
-         previous_text = ""
-         current_text = ""
+         previous_texts = [""]
          async for chunk in chunks:
              if i == 0:
-                 chat_chunk = cls._get_first_chat_completion_chunk(chunk)
+                 chat_chunk = cls._get_first_chat_completion_chunk(
+                     chunk, reasoning_parser
+                 )
              elif not chunk.get("choices"):
                  # usage
                  chat_chunk = cls._get_final_chat_completion_chunk(chunk)
              else:
-                 chat_chunk = cls._to_chat_completion_chunk(chunk)
-                 if reasoning_parser is not None:
-                     choices = chat_chunk.get("choices")
-                     if choices is None:
-                         continue
-                     for choice in choices:
-                         delta = choice.get("delta")
-                         if not delta:
-                             continue
-                         current_text = previous_text + delta.get("content", "")
-                         choice[
-                             "delta"
-                         ] = reasoning_parser.extract_reasoning_content_streaming(
-                             previous_text=previous_text,
-                             current_text=current_text,
-                             delta=delta,
-                         )
-                         previous_text = current_text
+                 chat_chunk = cls._to_chat_completion_chunk(
+                     chunk, reasoning_parser, previous_texts
+                 )
              yield chat_chunk
              i += 1

@@ -404,6 +428,21 @@ class ChatModelMixin:
      def _to_chat_completion(
          completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
      ) -> ChatCompletion:
+         if completion.get("object") == "chat.completion" and completion.get("choices"):
+             # Already a ChatCompletion
+             if reasoning_parser is not None:
+                 for choice in completion["choices"]:
+                     message = choice["message"] # type: ignore
+                     text = message["content"]
+                     (
+                         reasoning_content,
+                         content,
+                     ) = reasoning_parser.extract_reasoning_content(text)
+                     message["content"] = content
+                     if reasoning_content is not None:
+                         message["reasoning_content"] = reasoning_content
+             return cast(ChatCompletion, completion)
+
          choices = []
          for i, choice in enumerate(completion["choices"]):
              content = choice["text"]
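
When a parser is active, an already-converted chat.completion is rewritten in place: the <think> block moves into message["reasoning_content"] and message["content"] keeps only the final answer. Illustrative shape of one rewritten choice (values invented for the example):

choice = {
    "index": 0,
    "message": {
        "role": "assistant",
        "reasoning_content": "The user wants 2 + 2, which is 4.",
        "content": "The answer is 4.",
    },
    "finish_reason": "stop",
}
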
@@ -565,7 +604,14 @@ class ChatModelMixin:
          return result

      @classmethod
-     def _tool_calls_completion_chunk(cls, model_family, model_uid, c, chunk_id=None):
+     def _post_process_completion_chunk(
+         cls,
+         model_family,
+         model_uid,
+         c,
+         chunk_id=None,
+         reasoning_parser: Optional[ReasoningParser] = None,
+     ):
          _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
          tool_result = cls._eval_tool_arguments(model_family, c)
          tool_calls = []
@@ -585,11 +631,22 @@ class ChatModelMixin:
              else:
                  failed_contents.append(content)
          finish_reason = "tool_calls" if tool_calls else "stop"
+
+         reasoning_content = None
+         content = ". ".join(failed_contents) if failed_contents else None
+         if reasoning_parser is not None:
+             reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
+                 content
+             )
          d = {
              "role": "assistant",
-             "content": ". ".join(failed_contents) if failed_contents else None,
+             "content": content,
              "tool_calls": tool_calls,
          }
+         # add only reasoning_content is None
+         if reasoning_content is not None:
+             d["reasoning_content"] = reasoning_content
+
          try:
              usage = c.get("usage")
              assert "prompt_tokens" in usage
@@ -616,7 +673,13 @@ class ChatModelMixin:
          }

      @classmethod
-     def _tool_calls_completion(cls, model_family, model_uid, c):
+     def _post_process_completion(
+         cls,
+         model_family,
+         model_uid,
+         c,
+         reasoning_parser: Optional[ReasoningParser] = None,
+     ):
          _id = str(uuid.uuid4())
          tool_result = cls._eval_tool_arguments(model_family, c)

@@ -637,11 +700,22 @@ class ChatModelMixin:
              else:
                  failed_contents.append(content)
          finish_reason = "tool_calls" if tool_calls else "stop"
+
+         reasoning_content = None
+         content = ". ".join(failed_contents) if failed_contents else None
+         if reasoning_parser is not None:
+             reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
+                 content
+             )
          m = {
              "role": "assistant",
-             "content": ". ".join(failed_contents) if failed_contents else None,
+             "content": content,
              "tool_calls": tool_calls,
          }
+         # add only reasoning_content is None
+         if reasoning_content is not None:
+             m["reasoning_content"] = reasoning_content
+
          try:
              usage = c.get("usage")
              assert "prompt_tokens" in usage
@@ -43,8 +43,6 @@ from ....types import (
  )
  from .. import LLM, LLMFamilyV1, LLMSpecV1
  from ..llm_family import CustomLLMFamilyV1
- from ..reasoning_parsers import deepseek_r1_reasoning_parser # noqa: F401
- from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
  from ..utils import (
      DEEPSEEK_TOOL_CALL_FAMILY,
      QWEN_TOOL_CALL_FAMILY,
@@ -160,6 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
      VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
      VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
      VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
+     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
      VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
      VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")

@@ -196,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
  if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
      VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
      VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
+     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")

  if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
      VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -211,9 +211,10 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":

  if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
      VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+     VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")

  if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
-     VLLM_SUPPORTED_CHAT_MODELS.append("qwen-2.5-instruct-1m")
+     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")


  class VLLMModel(LLM):
@@ -243,7 +244,6 @@ class VLLMModel(LLM):
          self.lora_modules = peft_model
          self.lora_requests: List[LoRARequest] = []
          self._xavier_config = None
-         self.reasoning_parser = None

      def set_xavier_config(self, value: Optional[Dict]):
          self._xavier_config = value # type: ignore
@@ -274,14 +274,8 @@ class VLLMModel(LLM):
          self._model_config = self._sanitize_model_config(self._model_config)
          reasoning_content = self._model_config.pop("reasoning_content")

-         # Initialize reasoning parser if model has reasoning ability
-         if "reasoning" in self.model_family.model_ability and reasoning_content:
-             module_name = self.model_family.model_family or self.model_family.model_name
-             self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-             self.reasoning_parser = self.reasoning_parser(
-                 self.model_family.reasoning_start_tag,
-                 self.model_family.reasoning_end_tag,
-             )
+         self.prepare_parse_reasoning_content(reasoning_content)
+
          if self.lora_modules is None:
              self.lora_requests = []
          else:
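
The per-engine setup removed here (and the matching calls added for SGLang and Transformers earlier in this diff) is replaced by a shared prepare_parse_reasoning_content helper that is not itself shown in this diff. A hedged sketch of what that helper plausibly does, reconstructed from the removed block above and the new ReasoningParser constructor; the packaged helper may differ in detail:

def prepare_parse_reasoning_content(self, reasoning_content: bool) -> None:
    # Sketch only: build a parser when the model family declares the
    # "reasoning" ability and the caller asked for reasoning_content.
    self.reasoning_parser = None
    if reasoning_content and "reasoning" in self.model_family.model_ability:
        self.reasoning_parser = ReasoningParser(
            self.model_family.reasoning_start_tag,
            self.model_family.reasoning_end_tag,
        )
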
@@ -581,6 +575,10 @@ class VLLMModel(LLM):
              raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

          sanitized_generate_config = self._sanitize_generate_config(generate_config)
+         if self.reasoning_parser:
+             # For reasoning model, the </think> we be split into multiple words,
+             # if `stop` param is passed, so we pop it from config.
+             sanitized_generate_config.pop("stop")
          logger.debug(
              "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
          )
@@ -812,18 +810,23 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
          i = 0
          async for chunk in chunks:
              if i == 0:
-                 yield self._get_first_chat_completion_chunk(chunk)
+                 yield self._get_first_chat_completion_chunk(
+                     chunk, self.reasoning_parser
+                 )
              # usage
              choices = chunk.get("choices")
              if not choices:
                  yield self._get_final_chat_completion_chunk(chunk)
              else:
                  if self.is_tool_call_chunk(chunk):
-                     yield self._tool_calls_completion_chunk(
-                         self.model_family, self.model_uid, chunk
+                     yield self._post_process_completion_chunk(
+                         self.model_family,
+                         self.model_uid,
+                         chunk,
+                         reasoning_parser=self.reasoning_parser,
                      )
                  else:
-                     yield self._to_chat_completion_chunk(chunk)
+                     yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
              i += 1

      @vllm_check
@@ -863,7 +866,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
          )
          assert not isinstance(c, AsyncGenerator)
          if tools:
-             return self._tool_calls_completion(self.model_family, self.model_uid, c)
+             return self._post_process_completion(
+                 self.model_family, self.model_uid, c, self.reasoning_parser
+             )
          return self._to_chat_completion(c, self.reasoning_parser)


@@ -905,31 +910,15 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
      def _sanitize_model_config(
          self, model_config: Optional[VLLMModelConfig]
      ) -> VLLMModelConfig:
-         if model_config is None:
-             model_config = VLLMModelConfig()
-
-         cuda_count = self._get_cuda_count()
-
-         model_config.setdefault("tokenizer_mode", "auto")
-         model_config.setdefault("trust_remote_code", True)
-         model_config.setdefault("tensor_parallel_size", cuda_count)
-         model_config.setdefault("block_size", 16)
-         model_config.setdefault("swap_space", 4)
-         model_config.setdefault("gpu_memory_utilization", 0.90)
-         model_config.setdefault("max_num_seqs", 256)
-         model_config.setdefault("quantization", None)
-         model_config.setdefault("max_model_len", None)
-         model_config["limit_mm_per_prompt"] = (
-             json.loads(model_config.get("limit_mm_per_prompt")) # type: ignore
-             if model_config.get("limit_mm_per_prompt")
-             else {
-                 "image": 2, # default 2 images all chat
-             }
-         )
-         # Add scheduling policy if vLLM version is 0.6.3 or higher
-         if vllm.__version__ >= "0.6.3":
-             model_config.setdefault("scheduling_policy", "fcfs")
-
+         model_config = super()._sanitize_model_config(model_config)
+         if vllm.__version__ >= "0.5.5":
+             model_config["limit_mm_per_prompt"] = (
+                 json.loads(model_config.get("limit_mm_per_prompt")) # type: ignore
+                 if model_config.get("limit_mm_per_prompt")
+                 else {
+                     "image": 2, # default 2 images all chat
+                 }
+             )
          return model_config

      def _sanitize_chat_config(