xinference 1.2.2__py3-none-any.whl → 1.3.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/client/restful/restful_client.py +9 -1
- xinference/core/model.py +19 -0
- xinference/core/resource.py +7 -1
- xinference/core/status_guard.py +1 -0
- xinference/core/supervisor.py +228 -19
- xinference/core/utils.py +1 -29
- xinference/core/worker.py +28 -2
- xinference/deploy/cmdline.py +33 -3
- xinference/deploy/test/test_cmdline.py +32 -0
- xinference/device_utils.py +43 -1
- xinference/model/audio/kokoro.py +19 -36
- xinference/model/audio/model_spec.json +1 -1
- xinference/model/image/stable_diffusion/core.py +15 -6
- xinference/model/llm/llm_family.json +521 -6
- xinference/model/llm/llm_family.py +3 -1
- xinference/model/llm/llm_family_modelscope.json +559 -6
- xinference/model/llm/reasoning_parsers/__init__.py +13 -0
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
- xinference/model/llm/sglang/core.py +99 -11
- xinference/model/llm/transformers/intern_vl.py +23 -14
- xinference/model/llm/utils.py +55 -18
- xinference/model/llm/vllm/core.py +23 -2
- xinference/model/llm/vllm/xavier/executor.py +2 -2
- xinference/model/llm/vllm/xavier/scheduler.py +3 -3
- xinference/thirdparty/internvl/conversation.py +26 -17
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.f8177338.css +2 -0
- xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
- xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
- xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
- xinference/web/ui/src/locales/en.json +14 -1
- xinference/web/ui/src/locales/zh.json +14 -1
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/METADATA +11 -11
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/RECORD +55 -49
- xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
- xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
- xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
- xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
- /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/LICENSE +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/WHEEL +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py
ADDED
@@ -0,0 +1,98 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Type, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+
+
+class ReasoningParser(ABC):
+    """Abstract base class for reasoning content parsers."""
+
+    def __init__(
+        self,
+        reasoning_start_tag: str = "<think>",
+        reasoning_end_tag: str = "</think>",
+    ):
+        """Initialize the reasoning parser.
+
+        Args:
+            reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
+            reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
+        """
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
+
+    @abstractmethod
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from model output in a streaming fashion.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        pass
+
+    @abstractmethod
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        pass
+
+
+class ReasoningParserManager:
+    """Manager class for reasoning parsers."""
+
+    _parsers: Dict[str, Type[ReasoningParser]] = {}
+
+    @classmethod
+    def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
+        """Register a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+            parser_cls (Type[ReasoningParser]): The parser class to register.
+        """
+        cls._parsers[model_name] = parser_cls
+
+    @classmethod
+    def register_module(cls, model_name: str):
+        """Decorator for registering a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Callable: The decorator function.
+        """
+
+        def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
+            cls.register(model_name, parser_cls)
+            return parser_cls
+
+        return _register
+
+    @classmethod
+    def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
+        """Get the registered parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
+        """
+        return cls._parsers.get(model_name)
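For context, a minimal sketch of how the registry added above is meant to be used. It is not part of the diff; the model name "my-reasoning-model" and the TagReasoningParser class are made-up examples, while the import path follows the new file.

# Sketch only: register a toy parser and look it up again (assumes xinference >= 1.3.0 installed).
from typing import Union

from xinference.model.llm.reasoning_parsers.abs_reasoning_parsers import (
    ReasoningParser,
    ReasoningParserManager,
)


@ReasoningParserManager.register_module("my-reasoning-model")  # hypothetical model name
class TagReasoningParser(ReasoningParser):
    """Toy parser: everything before the end tag counts as reasoning."""

    def extract_reasoning_content_streaming(self, previous_text, current_text, delta):
        # Pass the streaming delta through unchanged in this toy example.
        return delta

    def extract_reasoning_content(self, model_output: Union[str, dict]):
        text = model_output if isinstance(model_output, str) else model_output["text"]
        if self.reasoning_end_tag not in text:
            return None, text
        reasoning, _, rest = text.partition(self.reasoning_end_tag)
        return reasoning.replace(self.reasoning_start_tag, ""), rest


parser_cls = ReasoningParserManager.get_parser("my-reasoning-model")
parser = parser_cls("<think>", "</think>")
print(parser.extract_reasoning_content("<think>plan steps</think>final answer"))
# -> ('plan steps', 'final answer')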
xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py
ADDED
@@ -0,0 +1,140 @@
+import re
+from typing import Optional, Tuple, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module("deepseek-v3")
+@ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
+@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-R1 model."""
+
+    def __init__(
+        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+    ):
+        super().__init__(reasoning_start_tag, reasoning_end_tag)
+        self.reasoning_regex = re.compile(
+            rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
+        )
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
+
+        Args:
+            previous_text (str): The previous accumulated text content.
+            current_text (Union[str, ChatCompletionChunk]): The current text chunk or completion chunk.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        if delta is None:
+            return delta
+
+        delta_text = delta["content"]
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.reasoning_start_tag in previous_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # <think> in previous, </think> in previous,
+                # <think> in previous, </think> in previous,
+                # reasoning content ends
+                return delta
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        elif self.reasoning_start_tag in delta_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_idx = delta_text.find(self.reasoning_start_tag)
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[
+                    start_idx + len(self.reasoning_start_tag) : end_idx
+                ]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.reasoning_end_tag in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # </think> in previous, thinking content ends
+                return delta
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from DeepSeek-R1 model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        if not isinstance(model_output, str):
+            model_output = model_output["text"]
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.reasoning_end_tag not in model_output:
+            return model_output, ""
+        else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.reasoning_start_tag not in model_output:
+                model_output = f"{self.reasoning_start_tag}{model_output}"
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            end_index = len(
+                f"{self.reasoning_start_tag}{reasoning_content}{self.reasoning_end_tag}"
+            )
+            final_output = model_output[end_index:]
+
+            if len(final_output) == 0:
+                return reasoning_content, ""
+            return reasoning_content, final_output
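A quick sanity check of the non-streaming path, outside the diff itself: it assumes xinference 1.3.0.post1 is importable and simply feeds plain strings to the parser added above.

# Sketch only: the R1 parser splits chain-of-thought from the final answer.
from xinference.model.llm.reasoning_parsers.deepseek_r1_reasoning_parser import (
    DeepSeekR1ReasoningParser,
)

parser = DeepSeekR1ReasoningParser()

# R1 may omit the leading <think>; the parser prepends it before matching.
reasoning, answer = parser.extract_reasoning_content(
    "First I compare the two decimals...</think>9.11 is smaller than 9.9."
)
print(reasoning)  # "First I compare the two decimals..."
print(answer)     # "9.11 is smaller than 9.9."

# Without any </think>, the whole output is treated as reasoning content.
print(parser.extract_reasoning_content("still thinking"))  # ("still thinking", "")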
xinference/model/llm/sglang/core.py
CHANGED
@@ -14,10 +14,14 @@
 
 import json
 import logging
+import sys
+import threading
 import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
 
+from xoscar.utils import get_next_port
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -40,6 +44,10 @@ class SGLANGModelConfig(TypedDict, total=False):
     mem_fraction_static: float
     log_level: str
     attention_reduce_in_fp32: bool  # For gemma
+    # distributed
+    nnodes: Optional[int]
+    node_rank: Optional[int]
+    dist_init_addr: Optional[str]
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
@@ -91,6 +99,10 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+    "deepseek-v3",
+    "deepseek-r1",
 ]
 
 
@@ -107,6 +119,16 @@ class SGLANGModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self._address = model_config.pop("address", None)  # type: ignore
+        self._n_worker = model_config.pop("n_worker", 1)  # type: ignore
+        self._shard = model_config.pop("shard", 0)  # type: ignore
+        self._driver_info = model_config.pop("driver_info", None)  # type: ignore
+        self._loading_thread = None
+        self._loading_error = None
+
+    @property
+    def driver_info(self) -> Optional[dict]:
+        return self._driver_info
 
     def load(self):
         try:
@@ -128,18 +150,84 @@
         else:
             self._model_config.setdefault("attention_reduce_in_fp32", False)
 
-
-
-
+        # gen port for sgl Runtime,
+        # this is useful for sglang service on a same machine.
+        # sglang typically find a port between [port, 40000]
+        # we need to ensure the generated port < 40000
+        sgl_port = None
+        for _ in range(10):
+            sgl_port = get_next_port()
+            if sgl_port >= 40000:
+                sgl_port = None
+            else:
+                break
+        if sgl_port is None:
+            raise ValueError("Failed to find a port for sglang")
+
+        if self._n_worker > 1:
+            # distributed inference
+            self._model_config["nnodes"] = self._n_worker
+            self._model_config["node_rank"] = self._shard
+            # model across multiple workers
+            if self._shard == 0:
+                # distributed, need to init driver_info
+                assert self._driver_info is None
+                # This must run inside Xoscar pool
+                dist_init_addr = f"{self._address.split(':', 1)[0]}:{get_next_port()}"
+                self._driver_info = {"dist_init_addr": dist_init_addr}
+                self._model_config["dist_init_addr"] = dist_init_addr
+            else:
+                assert self._driver_info is not None
+                self._model_config["dist_init_addr"] = self._driver_info[
+                    "dist_init_addr"
+                ]
 
-
-
-
-
-
+            logger.info(
+                f"Loading {self.model_uid}, shard({self._shard} of {self._n_worker}) with following model config: {self._model_config}"
+            )
+
+            def _load():
+                try:
+                    self._engine = sgl.Runtime(
+                        model_path=self.model_path,
+                        tokenizer_path=self.model_path,
+                        port=sgl_port,
+                        **self._model_config,
+                    )
+                except:
+                    logger.exception("Creating sglang Runtime failed")
+                    self._loading_error = sys.exc_info()
+
+            self._loading_thread = threading.Thread(target=_load)
+            self._loading_thread.start()
+            if self._shard == 0:
+                # wait for 3 seconds to ensure torch distributed inited first
+                self._loading_thread.join(3)
+        else:
+            logger.info(
+                f"Loading {self.model_uid} with following model config: {self._model_config}"
+            )
+
+            self._engine = sgl.Runtime(
+                model_path=self.model_path,
+                tokenizer_path=self.model_path,
+                port=sgl_port,
+                **self._model_config,
+            )
+
+    def wait_for_load(self):
+        if self._loading_thread:
+            if self._shard == 0:
+                # for the shard 0, we wait it to complete
+                # the sglang will serve forever for the other shards,
+                # so we only check if any error happens.
+                self._loading_thread.join()
+                if self._loading_error:
+                    _, err, tb = self._loading_error
+                    raise err.with_traceback(tb)
 
     def stop(self):
-        logger.info("Stopping SGLang engine")
+        logger.info("Stopping SGLang engine, sglang pid: %s", self._engine.pid)
         self._engine.shutdown()
 
     def _sanitize_model_config(
@@ -151,7 +239,7 @@ class SGLANGModel(LLM):
         cuda_count = self._get_cuda_count()
         model_config.setdefault("tokenizer_mode", "auto")
         model_config.setdefault("trust_remote_code", True)
-        model_config.setdefault("tp_size", cuda_count)
+        model_config.setdefault("tp_size", cuda_count * self._n_worker)
         # See https://github.com/sgl-project/sglang/blob/00023d622a6d484e67ef4a0e444f708b8fc861c8/python/sglang/srt/server_args.py#L100-L109
         mem_fraction_static = model_config.get("mem_fraction_static")
         if mem_fraction_static is None:
@@ -159,7 +247,7 @@ class SGLANGModel(LLM):
         if tp_size >= 16:
             model_config["mem_fraction_static"] = 0.79
         elif tp_size >= 8:
-            model_config["mem_fraction_static"] = 0.
+            model_config["mem_fraction_static"] = 0.81
         elif tp_size >= 4:
             model_config["mem_fraction_static"] = 0.85
         elif tp_size >= 2:
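As a side note on the port handling above, here is the same retry loop pulled out into a standalone helper. This is a sketch, not code from the release; pick_sglang_port is a hypothetical name, while xoscar.utils.get_next_port is the helper the hunk actually imports, and the 40000 ceiling mirrors SGLang's own port search range.

# Sketch only: find a free port below 40000, retrying a few times like SGLANGModel.load().
from xoscar.utils import get_next_port


def pick_sglang_port(max_tries: int = 10) -> int:
    for _ in range(max_tries):
        port = get_next_port()
        if port < 40000:
            return port
    raise ValueError("Failed to find a port for sglang")


print(pick_sglang_port())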
xinference/model/llm/transformers/intern_vl.py
CHANGED
@@ -265,15 +265,24 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
+        model_name = f"{self.model_family.model_name.lower()}-{model_size}"
         num_layers = {
-            "1B": 24,
-            "2B": 24,
-            "4B": 32,
-            "8B": 32,
-            "26B": 48,
-            "40B": 60,
-            "76B": 80,
-
+            "internvl2-1B": 24,
+            "internvl2-2B": 24,
+            "internvl2-4B": 32,
+            "internvl2-8B": 32,
+            "internvl2-26B": 48,
+            "internvl2-40B": 60,
+            "internvl2-76B": 80,
+            "internvl2.5-1B": 24,
+            "internvl2.5-2B": 24,
+            "internvl2.5-4B": 36,
+            "internvl2.5-8B": 32,
+            "internvl2.5-26B": 48,
+            "internvl2.5-38B": 64,
+            "internvl2.5-78B": 80,
+        }[model_name]
+
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
         num_layers_per_gpu = [num_layers_per_gpu] * world_size
@@ -322,9 +331,7 @@ class InternVLChatModel(PytorchChatModel):
             self._model.cuda()
 
         self._tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True,
-            use_fast=False,
+            self.model_path, trust_remote_code=True, use_fast=False
         )
 
     @cache_clean
@@ -339,11 +346,12 @@ class InternVLChatModel(PytorchChatModel):
         IMG_END_TOKEN = "</img>"
         IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
 
+        generate_config = generate_config if isinstance(generate_config, dict) else {}
+
         generation_config = {
-            "max_new_tokens": generate_config.get("max_tokens", 1024)
-            if generate_config
-            else 1024,
+            "max_new_tokens": (generate_config.get("max_tokens", 1024)),
             "do_sample": False,
+            "temperature": generate_config.get("temperature", None),
         }
 
         stream = (
@@ -458,6 +466,7 @@ class InternVLChatModel(PytorchChatModel):
         streamer = TextIteratorStreamer(
             self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
         )
+
        # Define the generation configuration
         generate_kwargs["streamer"] = streamer
         # Start the model chat in a separate thread
xinference/model/llm/utils.py
CHANGED
@@ -54,6 +54,7 @@ from .llm_family import (
     _get_cache_dir,
     get_cache_status,
 )
+from .reasoning_parsers.abs_reasoning_parsers import ReasoningParser
 
 logger = logging.getLogger(__name__)
 
@@ -321,6 +322,7 @@ class ChatModelMixin:
     def _to_chat_completion_chunks(
         cls,
         chunks: Iterator[CompletionChunk],
+        reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
         for i, chunk in enumerate(chunks):
             if i == 0:
@@ -365,37 +367,72 @@
     async def _async_to_chat_completion_chunks(
         cls,
         chunks: AsyncGenerator[CompletionChunk, None],
+        reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
+        previous_text = ""
+        current_text = ""
         async for chunk in chunks:
             if i == 0:
-
-
-
-
-                yield cls._get_final_chat_completion_chunk(chunk)
+                chat_chunk = cls._get_first_chat_completion_chunk(chunk)
+            elif not chunk.get("choices"):
+                # usage
+                chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
-
+                chat_chunk = cls._to_chat_completion_chunk(chunk)
+            if reasoning_parser is not None:
+                choices = chat_chunk.get("choices")
+                if choices is None:
+                    continue
+                for choice in choices:
+                    delta = choice.get("delta")
+                    if not delta:
+                        continue
+                    current_text = previous_text + delta.get("content", "")
+                    choice[
+                        "delta"
+                    ] = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_text,
+                        current_text=current_text,
+                        delta=delta,
+                    )
+                    previous_text = current_text
+            yield chat_chunk
             i += 1
 
     @staticmethod
-    def _to_chat_completion(
+    def _to_chat_completion(
+        completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
+    ) -> ChatCompletion:
+        choices = []
+        for i, choice in enumerate(completion["choices"]):
+            content = choice["text"]
+            reasoning_content = None
+
+            if reasoning_parser is not None:
+                reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                    choice
+                )
+
+            message = {"role": "assistant", "content": content}
+
+            # add only reasoning_content is None
+            if reasoning_content is not None:
+                message["reasoning_content"] = reasoning_content
+
+            choices.append(
+                {
+                    "index": i,
+                    "message": message,
+                    "finish_reason": choice["finish_reason"],
+                }
+            )
         return {
             "id": "chat" + completion["id"],
             "object": "chat.completion",
             "created": completion["created"],
             "model": completion["model"],
-            "choices": [
-                {
-                    "index": i,
-                    "message": {
-                        "role": "assistant",
-                        "content": choice["text"],
-                    },
-                    "finish_reason": choice["finish_reason"],
-                }
-                for i, choice in enumerate(completion["choices"])
-            ],
+            "choices": choices,  # type: ignore
             "usage": completion["usage"],
         }
 
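What the reworked streaming conversion looks like from a client's point of view: a sketch only, assuming an OpenAI-compatible Xinference endpoint on the default port and a model launched with reasoning-content parsing enabled; the base URL and model uid below are example values.

# Sketch only: reasoning_content shows up on streaming deltas alongside content.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
stream = client.chat.completions.create(
    model="deepseek-r1-distill-qwen",  # example uid of a launched model
    messages=[{"role": "user", "content": "Which is larger, 9.9 or 9.11?"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    # While the model is "thinking", content stays empty and reasoning_content carries text.
    reasoning = getattr(delta, "reasoning_content", None)
    if reasoning:
        print(reasoning, end="", flush=True)
    elif delta.content:
        print(delta.content, end="", flush=True)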
xinference/model/llm/vllm/core.py
CHANGED
@@ -43,6 +43,8 @@ from ....types import (
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
+from ..reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
+from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -72,6 +74,7 @@ class VLLMModelConfig(TypedDict, total=False):
     limit_mm_per_prompt: Optional[Dict[str, int]]
     guided_decoding_backend: Optional[str]
     scheduling_policy: Optional[str]
+    reasoning_content: bool
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -176,6 +179,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.1":
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v3")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
@@ -190,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -206,6 +212,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen-2.5-instruct-1m")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -234,6 +243,7 @@ class VLLMModel(LLM):
         self.lora_modules = peft_model
         self.lora_requests: List[LoRARequest] = []
         self._xavier_config = None
+        self.reasoning_parser = None
 
     def set_xavier_config(self, value: Optional[Dict]):
         self._xavier_config = value  # type: ignore
@@ -262,6 +272,16 @@
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
+        reasoning_content = self._model_config.pop("reasoning_content")
+
+        # Initialize reasoning parser if model has reasoning ability
+        if "reasoning" in self.model_family.model_ability and reasoning_content:
+            module_name = self.model_family.model_family or self.model_family.model_name
+            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
+            self.reasoning_parser = self.reasoning_parser(
+                self.model_family.reasoning_start_tag,
+                self.model_family.reasoning_end_tag,
+            )
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -368,6 +388,7 @@ class VLLMModel(LLM):
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
         model_config.setdefault("guided_decoding_backend", "outlines")
+        model_config.setdefault("reasoning_content", False)
         # Add scheduling policy if vLLM version is 0.6.3 or higher
         if vllm.__version__ >= "0.6.3":
            model_config.setdefault("scheduling_policy", "fcfs")
@@ -835,7 +856,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert isinstance(agen, AsyncGenerator)
             if tools:
                 return self._async_to_tool_completion_chunks(agen)
-            return self._async_to_chat_completion_chunks(agen)
+            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
             c = await self.async_generate(
                 full_prompt, generate_config, request_id=request_id
@@ -843,7 +864,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert not isinstance(c, AsyncGenerator)
             if tools:
                 return self._tool_calls_completion(self.model_family, self.model_uid, c)
-            return self._to_chat_completion(c)
+            return self._to_chat_completion(c, self.reasoning_parser)
 
 
 class VLLMVisionModel(VLLMModel, ChatModelMixin):
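Finally, how the new reasoning_content switch in VLLMModelConfig might be exercised end to end through the Python client. This is a sketch under the assumption that extra launch_model kwargs are forwarded into the vLLM model config; the model name, size, and uid handling are example values.

# Sketch only: launch a reasoning model with reasoning_content enabled and read both fields back.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",
    model_engine="vllm",
    model_size_in_billions=7,
    reasoning_content=True,  # the new VLLMModelConfig option; defaults to False in this release
)
model = client.get_model(uid)
result = model.chat(messages=[{"role": "user", "content": "Why is the sky blue?"}])
message = result["choices"][0]["message"]
print(message.get("reasoning_content"))  # chain-of-thought extracted from <think>...</think>
print(message["content"])                # final answer with the think block stripped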
|