xinference 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/mlx/core.py
@@ -160,7 +160,10 @@ class MLXModel(LLM):
 
     def load(self):
         reasoning_content = self._model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._model_config.pop("enable_thinking", True)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
         kwargs = {}
         kwargs["revision"] = self._model_config.get(
@@ -450,7 +453,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
         )
         if tools:
             if (
@@ -634,7 +637,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         from qwen_vl_utils import process_vision_info
 
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config)  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser)  # type: ignore
             or {}
         )
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
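The hunks above thread a new enable_thinking switch from the engine's model config into the reasoning-content machinery. As a usage illustration only (a hedged sketch, not part of this diff): assuming launch-time keyword arguments still flow into the model config as in earlier releases, the two keys popped in load() could be supplied like this; the endpoint, model name and engine are placeholders.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # placeholder endpoint
model_uid = client.launch_model(
    model_name="qwen3",           # hybrid model supporting thinking and non-thinking
    model_engine="mlx",           # any engine whose load() pops these keys
    reasoning_content=True,       # split reasoning into the reasoning_content field
    enable_thinking=False,        # assumed switch to disable thinking for hybrid models
)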

xinference/model/llm/reasoning_parser.py
@@ -1,20 +1,33 @@
 import re
-from typing import Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, Union
 
-from ...types import ChatCompletionChunkDelta, CompletionChoice
+from ...types import (
+    ChatCompletionChunk,
+    ChatCompletionChunkDelta,
+    CompletionChoice,
+    CompletionChunk,
+)
 
 
 class ReasoningParser:
     """Reasoning parser for reasoning model."""
 
     def __init__(
-        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+        self,
+        reasoning_content: bool = False,
+        reasoning_start_tag: str = "",
+        reasoning_end_tag: str = "",
+        enable_thinking: bool = True,
     ):
+        self.reasoning_content = reasoning_content
         self.reasoning_start_tag = reasoning_start_tag
         self.reasoning_end_tag = reasoning_end_tag
         self.reasoning_regex = re.compile(
             rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
         )
+        # enable_thinking can be set to False only for hybrid model
+        # e.g. qwen3, which can support both thinking and non-thinking
+        self.enable_thinking = enable_thinking
 
     def extract_reasoning_content_streaming(
         self,
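For orientation, a minimal construction sketch under the new signature (illustrative only; the "<think>"/"</think>" values shown were the previous defaults, not the new empty-string defaults):

from xinference.model.llm.reasoning_parser import ReasoningParser

parser = ReasoningParser(
    reasoning_content=True,          # extract reasoning into the reasoning_content field
    reasoning_start_tag="<think>",
    reasoning_end_tag="</think>",
    enable_thinking=True,            # set False only for hybrid models such as qwen3
)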
@@ -62,9 +75,9 @@ class ReasoningParser:
                delta["content"] = None
                return delta
            elif self.reasoning_start_tag in delta_text:
+                start_idx = delta_text.find(self.reasoning_start_tag)
                if self.reasoning_end_tag in delta_text:
                    # <think> in delta, </think> in delta, extract reasoning content
-                    start_idx = delta_text.find(self.reasoning_start_tag)
                    end_idx = delta_text.find(self.reasoning_end_tag)
                    reasoning_content = delta_text[
                        start_idx + len(self.reasoning_start_tag) : end_idx
@@ -79,7 +92,10 @@ class ReasoningParser:
                else:
                    # <think> in delta, no </think> in delta,
                    # reasoning content continues
-                    delta["reasoning_content"] = delta_text
+                    reasoning_content = delta_text[
+                        start_idx + len(self.reasoning_start_tag) :
+                    ]
+                    delta["reasoning_content"] = reasoning_content
                    delta["content"] = None
                    return delta
            else:
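A worked micro-example of the slicing this fix introduces: when a streamed delta contains the start tag but no end tag, only the text after the tag is now reported as reasoning content (tag and delta values are illustrative):

tag = "<think>"
delta_text = "Sure.<think>step one"

start_idx = delta_text.find(tag)
reasoning_content = delta_text[start_idx + len(tag):]

assert reasoning_content == "step one"   # previously the whole delta_text was used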
@@ -142,3 +158,263 @@ class ReasoningParser:
         if len(final_output) == 0:
             return reasoning_content, ""
         return reasoning_content, final_output
+
+    def check_content_parser(self) -> bool:
+        """Check if the parser should extract reasoning content.
+
+        Returns:
+            bool: True if reasoning content should be extracted, False otherwise
+        """
+        return self.reasoning_content
+
+    def _create_chat_completion_chunk(
+        self, chunk: Union[Dict[str, Any], CompletionChunk], content: str
+    ) -> ChatCompletionChunk:
+        """Helper method to create a ChatCompletionChunk with specified content.
+
+        Args:
+            chunk: The original chunk to copy metadata from
+            content: The content to include in the chunk
+
+        Returns:
+            ChatCompletionChunk: A new chat completion chunk
+        """
+        return ChatCompletionChunk(
+            id="chat" + chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="chat.completion.chunk",
+            choices=[
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": content,
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        )
+
+    def _create_completion_chunk(
+        self, chunk: Union[Dict[str, Any], CompletionChunk], text: str
+    ) -> CompletionChunk:
+        """Helper method to create a CompletionChunk with specified text.
+
+        Args:
+            chunk: The original chunk to copy metadata from
+            text: The text to include in the chunk
+
+        Returns:
+            CompletionChunk: A new completion chunk
+        """
+        return CompletionChunk(
+            id=chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="text_completion",
+            choices=[
+                {
+                    "index": 0,
+                    "text": text,
+                    "logprobs": None,
+                    "finish_reason": None,
+                }
+            ],
+        )
+
+    async def prepare_reasoning_content_streaming(
+        self, chunks: AsyncGenerator[CompletionChunk, None]
+    ):
+        """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+        if not, add a chunk with the tag at the beginning.
+
+        Args:
+            chunks (AsyncGenerator[CompletionChunk, None]): Chunks from model output
+
+        Yields:
+            AsyncGenerator[CompletionChunk, None]: Processed chunks
+        """
+
+        # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+        # yield chunks as is
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            async for chunk in chunks:
+                yield chunk
+            return
+
+        # If chunks is empty, return
+        if not chunks:
+            return
+
+        # Flag to identify the first chunk
+        is_first_chunk = True
+
+        async for chunk in chunks:
+            if is_first_chunk:
+                # Reset the flag after processing the first chunk
+                is_first_chunk = False
+                choices = chunk.get("choices")
+                if not choices or not choices[0]:
+                    continue
+                if (
+                    chunk.get("object") == "chat.completion.chunk"
+                    and "delta" in choices[0]
+                ):
+                    # For chat completion chunks with delta format
+                    delta = choices[0].get("delta")
+                    if delta is None:
+                        continue
+                    assert isinstance(delta, dict)
+                    text = delta.get("content")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_chat_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                else:
+                    # For standard completion chunks
+                    text = choices[0].get("text")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                # Yield the original first chunk
+                yield chunk
+            else:
+                # For non-first chunks, yield directly
+                yield chunk
+
+    def prepare_reasoning_content_sync(self, chunks: Iterator[CompletionChunk]):
+        """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+        if not, add a chunk with the tag at the beginning. This is a synchronous version of
+        prepare_reasoning_content_streaming.
+
+        Args:
+            chunks (Iterator[CompletionChunk]): Chunks from model output
+
+        Returns:
+            Iterator[CompletionChunk]: Processed chunks
+        """
+        # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+        # yield chunks as is
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            for chunk in chunks:
+                yield chunk
+            return
+
+        # Flag to identify the first chunk
+        is_first_chunk = True
+
+        for chunk in chunks:
+            if is_first_chunk:
+                # Reset the flag after processing the first chunk
+                is_first_chunk = False
+                choices = chunk.get("choices")
+                if not choices or not choices[0]:
+                    continue
+                if (
+                    chunk.get("object") == "chat.completion.chunk"
+                    and "delta" in choices[0]
+                ):
+                    # For chat completion chunks with delta format
+                    delta = choices[0].get("delta")
+                    if delta is None:
+                        continue
+                    assert isinstance(delta, dict)
+                    text = delta.get("content")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_chat_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                else:
+                    # For standard completion chunks
+                    text = choices[0].get("text")
+                    if text is None:
+                        continue
+                    # If the first chunk doesn't contain the reasoning_start_tag
+                    if self.reasoning_start_tag not in text:
+                        # Create and yield chunks with reasoning_start_tag and newline
+                        yield self._create_completion_chunk(
+                            chunk, f"{self.reasoning_start_tag}\n"
+                        )
+                # Yield the original first chunk
+                yield chunk
+            else:
+                # For non-first chunks, yield directly
+                yield chunk
+
+    def prepare_reasoning_content(self, completion):
+        """Ensures that the model output string starts with the reasoning_start_tag.
+
+        If the model_output is not a string (e.g., CompletionChoice), it extracts
+        the text content. If the reasoning_start_tag is not found in the text,
+        it prepends the tag to the text.
+
+        Args:
+            completion: The completion object containing model output,
+                which can be either a chat completion or a standard completion.
+        """
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            return completion
+
+        if completion.get("object") == "chat.completion" and completion.get("choices"):
+            text = completion["choices"][0]["message"]["content"]
+            if self.reasoning_start_tag not in text:
+                text = f"{self.reasoning_start_tag}\n{text}"
+                completion["choices"][0]["message"]["content"] = text
+            return completion
+
+        text = completion["choices"][0]["text"]
+        if self.reasoning_start_tag not in text:
+            text = f"{self.reasoning_start_tag}\n{text}"
+            completion["choices"][0]["text"] = text
+        return completion
+
+    def prepare_first_reasoning_content_chunk(
+        self,
+        chunk: CompletionChunk,
+    ) -> List[ChatCompletionChunk]:
+        """Prepares the first chunk of a completion by adding reasoning_start_tag if needed.
+
+        This function checks if the first chunk contains the reasoning_start_tag. If not,
+        it creates two new chunks containing the reasoning_start_tag and a newline character
+        that will be inserted before the original chunk.
+
+        Args:
+            chunk (CompletionChunk): The first chunk of a completion to check and possibly modify
+
+        Returns:
+            List[ChatCompletionChunk]: A list of new chunks to insert before the original chunk,
+                or an empty list if no modification is needed
+        """
+        chunks: List[ChatCompletionChunk] = []
+        if not self.reasoning_start_tag or not self.enable_thinking:
+            return chunks
+
+        choices = chunk.get("choices")
+        if not choices or not choices[0]:
+            return chunks
+        text = choices[0].get("text")
+        if not text:
+            return chunks
+
+        if self.reasoning_start_tag not in text:
+            # Create chunks with reasoning_start_tag and newline
+            chunks.append(
+                self._create_chat_completion_chunk(
+                    chunk, f"{self.reasoning_start_tag}\n"
+                )
+            )
+
+        return chunks
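A hedged usage sketch for the new helpers (the chunk dict below is fabricated for illustration): prepare_reasoning_content_sync inspects the first chunk and, if the start tag is missing, yields an extra chunk carrying the tag before passing the original through.

from xinference.model.llm.reasoning_parser import ReasoningParser

parser = ReasoningParser(
    reasoning_content=True,
    reasoning_start_tag="<think>",
    reasoning_end_tag="</think>",
)

# Hypothetical first chunk from an engine that omits the opening tag.
chunk = {
    "id": "cmpl-0",
    "model": "qwen3",
    "created": 0,
    "object": "text_completion",
    "choices": [
        {"index": 0, "text": "step one...", "logprobs": None, "finish_reason": None}
    ],
}

out = list(parser.prepare_reasoning_content_sync(iter([chunk])))
# out[0] is an injected chunk whose text is "<think>\n"; out[1] is the original chunk.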

xinference/model/llm/sglang/core.py
@@ -101,13 +101,17 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2-chat-0628",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "XiYanSQL-QwenCoder-2504",
     "QwQ-32B-Preview",
     "QwQ-32B",
     "deepseek-r1-distill-qwen",
     "deepseek-r1-distill-llama",
     "deepseek-v3",
     "deepseek-r1",
+    "DianJin-R1",
     "qwen3",
+    "HuatuoGPT-o1-Qwen2.5",
+    "HuatuoGPT-o1-LLaMA-3.1",
 ]
 SGLANG_SUPPORTED_VISION_MODEL_LIST = [
     "qwen2.5-vl-instruct",
@@ -155,7 +159,10 @@ class SGLANGModel(LLM):
 
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -568,7 +575,10 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
@@ -640,7 +650,10 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         )
 
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
         images, video_inputs = process_vision_info(messages)

xinference/model/llm/transformers/chatglm.py
@@ -464,7 +464,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
         full_context_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
-                r.generate_config
+                r.generate_config, self.reasoning_parser
             )
             or {}
         )
@@ -508,7 +508,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
         if "<bos_stream>" in req.completion:
             bos_pos = req.completion.index("<bos_stream>")
-            results.append(
+            results.extend(
                 self._get_first_chat_completion_chunk(req.completion[bos_pos + 1])
             )
 

xinference/model/llm/transformers/cogagent.py
@@ -207,7 +207,7 @@ class CogAgentChatModel(PytorchChatModel):
             "return_dict": True,
         }
         full_context_kwargs.update(
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
         )
         assert self.model_family.chat_template is not None
         inputs = self.get_full_context(

xinference/model/llm/transformers/cogvlm2.py
@@ -316,7 +316,7 @@ class CogVLM2Model(PytorchChatModel):
     def get_dtype(self):
         return self._torch_type
 
-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         prompt, system_prompt, chat_history = parse_messages(messages)
         system_prompt = system_prompt or ""
         query, image, history = self.get_query_and_history(

xinference/model/llm/transformers/core.py
@@ -339,7 +339,10 @@ class PytorchModel(LLM):
             is_device_map_auto = True
 
         reasoning_content = self._pytorch_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._pytorch_model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
@@ -702,7 +705,10 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
         full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
         if (
             tools
@@ -753,7 +759,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         results = []
         for i, c in enumerate(req.completion):
             if c == "<bos_stream>":
-                results.append(
+                results.extend(
                     self._get_first_chat_completion_chunk(
                         req.completion[i + 1], self.reasoning_parser
                     )
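The append → extend changes in this diff matter because the first-chunk helper now returns a list of chunks (the injected reasoning-start chunk plus the original), as suggested by prepare_first_reasoning_content_chunk earlier in this diff. A small, self-contained illustration with placeholder dicts:

results = []
first_chunks = [{"delta": "<think>\n"}, {"delta": "step one"}]   # hypothetical return value

results.append(first_chunks)   # -> [[{...}, {...}]]   one nested element, wrong shape
results.clear()
results.extend(first_chunks)   # -> [{...}, {...}]     flat sequence of chunks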

xinference/model/llm/transformers/glm4v.py
@@ -196,7 +196,7 @@ class Glm4VModel(PytorchChatModel):
             has_content=False,
         )
 
-    def _get_full_prompt(self, messages, tools):
+    def _get_full_prompt(self, messages, tools, generate_config: dict):
         msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,

xinference/model/llm/transformers/minicpmv26.py
@@ -324,7 +324,7 @@ class MiniCPMV26Model(PytorchChatModel):
             "input_image": images,
         }
 
-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         msgs, video_existed = self._convert_to_specific_style(messages)
         if video_existed:
             raise RuntimeError(

xinference/model/llm/transformers/qwen-omni.py
@@ -67,6 +67,12 @@ class Qwen2_5OmniChatModel(PytorchChatModel):
         return False
 
     def load(self):
+        logger.debug(
+            "Try to load model, current python: %s, sys path: %s",
+            sys.executable,
+            sys.path,
+        )
+
         from transformers import (
             Qwen2_5OmniForConditionalGeneration,
             Qwen2_5OmniProcessor,

xinference/model/llm/transformers/qwen_vl.py
@@ -313,7 +313,7 @@ class QwenVLChatModel(PytorchChatModel):
 
         return raw_text, context_tokens
 
-    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         prompt, qwen_history = self._get_prompt_and_chat_history(messages)
        _, context_tokens = self.make_context(self._tokenizer, prompt, qwen_history)
         return context_tokens