xinference 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (80)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +4 -7
  3. xinference/client/handlers.py +3 -0
  4. xinference/client/restful/restful_client.py +9 -1
  5. xinference/core/model.py +19 -0
  6. xinference/core/resource.py +7 -1
  7. xinference/core/scheduler.py +4 -7
  8. xinference/core/status_guard.py +1 -0
  9. xinference/core/supervisor.py +228 -19
  10. xinference/core/utils.py +1 -29
  11. xinference/core/worker.py +28 -2
  12. xinference/deploy/cmdline.py +33 -3
  13. xinference/deploy/local.py +2 -1
  14. xinference/deploy/test/test_cmdline.py +32 -0
  15. xinference/device_utils.py +43 -1
  16. xinference/model/audio/core.py +5 -0
  17. xinference/model/audio/kokoro.py +122 -0
  18. xinference/model/audio/model_spec.json +8 -0
  19. xinference/model/audio/model_spec_modelscope.json +9 -0
  20. xinference/model/image/stable_diffusion/core.py +15 -6
  21. xinference/model/llm/llama_cpp/core.py +21 -14
  22. xinference/model/llm/llm_family.json +866 -46
  23. xinference/model/llm/llm_family.py +7 -2
  24. xinference/model/llm/llm_family_modelscope.json +873 -16
  25. xinference/model/llm/mlx/core.py +11 -3
  26. xinference/model/llm/reasoning_parsers/__init__.py +13 -0
  27. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
  28. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
  29. xinference/model/llm/sglang/core.py +99 -11
  30. xinference/model/llm/transformers/core.py +9 -1
  31. xinference/model/llm/transformers/intern_vl.py +23 -14
  32. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  33. xinference/model/llm/transformers/qwen2_vl.py +20 -3
  34. xinference/model/llm/transformers/utils.py +22 -11
  35. xinference/model/llm/utils.py +164 -20
  36. xinference/model/llm/vllm/core.py +36 -4
  37. xinference/model/llm/vllm/xavier/executor.py +2 -2
  38. xinference/model/llm/vllm/xavier/scheduler.py +3 -3
  39. xinference/thirdparty/internvl/conversation.py +26 -17
  40. xinference/types.py +2 -0
  41. xinference/web/ui/build/asset-manifest.json +6 -6
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/css/main.f8177338.css +2 -0
  44. xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
  45. xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
  46. xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
  59. xinference/web/ui/src/locales/en.json +14 -1
  60. xinference/web/ui/src/locales/zh.json +14 -1
  61. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/METADATA +18 -17
  62. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/RECORD +67 -60
  63. xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
  64. xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
  65. xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
  66. xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
  76. /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
  77. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/LICENSE +0 -0
  78. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/WHEEL +0 -0
  79. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/entry_points.txt +0 -0
  80. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/top_level.txt +0 -0
@@ -31,7 +31,12 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin, generate_completion_chunk
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -424,8 +429,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
@@ -0,0 +1,13 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,98 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Type, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice, CompletionChunk
+
+
+class ReasoningParser(ABC):
+    """Abstract base class for reasoning content parsers."""
+
+    def __init__(
+        self,
+        reasoning_start_tag: str = "<think>",
+        reasoning_end_tag: str = "</think>",
+    ):
+        """Initialize the reasoning parser.
+
+        Args:
+            reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
+            reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
+        """
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
+
+    @abstractmethod
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: Union[str, CompletionChunk],
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from model output in a streaming fashion.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        pass
+
+    @abstractmethod
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        pass
+
+
+class ReasoningParserManager:
+    """Manager class for reasoning parsers."""
+
+    _parsers: Dict[str, Type[ReasoningParser]] = {}
+
+    @classmethod
+    def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
+        """Register a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+            parser_cls (Type[ReasoningParser]): The parser class to register.
+        """
+        cls._parsers[model_name] = parser_cls
+
+    @classmethod
+    def register_module(cls, model_name: str):
+        """Decorator for registering a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Callable: The decorator function.
+        """
+
+        def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
+            cls.register(model_name, parser_cls)
+            return parser_cls
+
+        return _register
+
+    @classmethod
+    def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
+        """Get the registered parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
+        """
+        return cls._parsers.get(model_name)
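
The new abs_reasoning_parsers.py above defines a small registry: parsers subclass ReasoningParser and register themselves per model name through ReasoningParserManager. A minimal usage sketch follows (not part of the diff; the model name "my-model" and the pass-through parser are invented, and an installed xinference 1.3.0 is assumed):

# Illustrative sketch only, not from the package: register and look up a parser.
from xinference.model.llm.reasoning_parsers.abs_reasoning_parsers import (
    ReasoningParser,
    ReasoningParserManager,
)


@ReasoningParserManager.register_module("my-model")
class PassthroughParser(ReasoningParser):
    def extract_reasoning_content_streaming(self, previous_text, current_text, delta):
        # No reasoning extraction: hand the delta back unchanged.
        return delta

    def extract_reasoning_content(self, model_output):
        # Treat the whole output as the answer, with no reasoning content.
        return None, model_output


parser_cls = ReasoningParserManager.get_parser("my-model")  # -> PassthroughParser
parser = parser_cls()  # defaults to "<think>" / "</think>" tags
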
@@ -0,0 +1,140 @@
+import re
+from typing import Optional, Tuple, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module("deepseek-v3")
+@ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
+@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-R1 model."""
+
+    def __init__(
+        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+    ):
+        super().__init__(reasoning_start_tag, reasoning_end_tag)
+        self.reasoning_regex = re.compile(
+            rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
+        )
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> Optional[ChatCompletionChunkDelta]:
+        """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
+
+        Args:
+            previous_text (str): The previous accumulated text content.
+            current_text (Union[str, ChatCompletionChunk]): The current text chunk or completion chunk.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        if delta is None:
+            return delta
+
+        delta_text = delta["content"]
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.reasoning_start_tag in previous_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # <think> in previous, </think> in previous,
+                # <think> in previous, </think> in previous,
+                # reasoning content ends
+                return delta
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        elif self.reasoning_start_tag in delta_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_idx = delta_text.find(self.reasoning_start_tag)
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[
+                    start_idx + len(self.reasoning_start_tag) : end_idx
+                ]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.reasoning_end_tag in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # </think> in previous, thinking content ends
+                return delta
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from DeepSeek-R1 model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        if not isinstance(model_output, str):
+            model_output = model_output["text"]
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.reasoning_end_tag not in model_output:
+            return model_output, None
+        else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.reasoning_start_tag not in model_output:
+                model_output = f"{self.reasoning_start_tag}{model_output}"
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            end_index = len(
+                f"{self.reasoning_start_tag}{reasoning_content}{self.reasoning_end_tag}"
+            )
+            final_output = model_output[end_index:]
+
+            if len(final_output) == 0:
+                return reasoning_content, None
+            return reasoning_content, final_output
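
For reference, a rough sketch (not part of the diff) of how the non-streaming path of this parser behaves; the sample strings are invented, and DeepSeek-R1's omission of the opening <think> tag is handled as in the code above:

# Illustrative only; sample strings are made up.
parser = DeepSeekR1ReasoningParser()

# Reasoning followed by the final answer: everything before </think> is reasoning.
reasoning, answer = parser.extract_reasoning_content(
    "Compare the decimals digit by digit.</think>9.9 is larger than 9.11"
)
assert reasoning == "Compare the decimals digit by digit."
assert answer == "9.9 is larger than 9.11"

# No </think> at all: the whole output is treated as reasoning, answer is None.
reasoning, answer = parser.extract_reasoning_content("Still thinking about it")
assert reasoning == "Still thinking about it"
assert answer is None
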
@@ -14,10 +14,14 @@
 
 import json
 import logging
+import sys
+import threading
 import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
 
+from xoscar.utils import get_next_port
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -40,6 +44,10 @@ class SGLANGModelConfig(TypedDict, total=False):
     mem_fraction_static: float
     log_level: str
     attention_reduce_in_fp32: bool  # For gemma
+    # distributed
+    nnodes: Optional[int]
+    node_rank: Optional[int]
+    dist_init_addr: Optional[str]
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
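
A hypothetical example (not part of the diff) of how the new distributed fields of SGLANGModelConfig might be filled in for a two-node deployment; all values are invented:

# Hypothetical two-node setup; the address and port are made up.
from xinference.model.llm.sglang.core import SGLANGModelConfig

shard0_config: SGLANGModelConfig = {
    "mem_fraction_static": 0.81,
    "nnodes": 2,  # total number of shards/workers
    "node_rank": 0,  # rank of this shard
    "dist_init_addr": "192.168.1.10:25000",  # rendezvous address shared by all shards
}
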
@@ -91,6 +99,10 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+    "deepseek-v3",
+    "deepseek-r1",
 ]
 
 
@@ -107,6 +119,16 @@ class SGLANGModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self._address = model_config.pop("address", None)  # type: ignore
+        self._n_worker = model_config.pop("n_worker", 1)  # type: ignore
+        self._shard = model_config.pop("shard", 0)  # type: ignore
+        self._driver_info = model_config.pop("driver_info", None)  # type: ignore
+        self._loading_thread = None
+        self._loading_error = None
+
+    @property
+    def driver_info(self) -> Optional[dict]:
+        return self._driver_info
 
     def load(self):
         try:
@@ -128,18 +150,84 @@ class SGLANGModel(LLM):
         else:
             self._model_config.setdefault("attention_reduce_in_fp32", False)
 
-        logger.info(
-            f"Loading {self.model_uid} with following model config: {self._model_config}"
-        )
+        # gen port for sgl Runtime,
+        # this is useful for sglang service on a same machine.
+        # sglang typically find a port between [port, 40000]
+        # we need to ensure the generated port < 40000
+        sgl_port = None
+        for _ in range(10):
+            sgl_port = get_next_port()
+            if sgl_port >= 40000:
+                sgl_port = None
+            else:
+                break
+        if sgl_port is None:
+            raise ValueError("Failed to find a port for sglang")
+
+        if self._n_worker > 1:
+            # distributed inference
+            self._model_config["nnodes"] = self._n_worker
+            self._model_config["node_rank"] = self._shard
+            # model across multiple workers
+            if self._shard == 0:
+                # distributed, need to init driver_info
+                assert self._driver_info is None
+                # This must run inside Xoscar pool
+                dist_init_addr = f"{self._address.split(':', 1)[0]}:{get_next_port()}"
+                self._driver_info = {"dist_init_addr": dist_init_addr}
+                self._model_config["dist_init_addr"] = dist_init_addr
+            else:
+                assert self._driver_info is not None
+                self._model_config["dist_init_addr"] = self._driver_info[
+                    "dist_init_addr"
+                ]
 
-        self._engine = sgl.Runtime(
-            model_path=self.model_path,
-            tokenizer_path=self.model_path,
-            **self._model_config,
-        )
+            logger.info(
+                f"Loading {self.model_uid}, shard({self._shard} of {self._n_worker}) with following model config: {self._model_config}"
+            )
+
+            def _load():
+                try:
+                    self._engine = sgl.Runtime(
+                        model_path=self.model_path,
+                        tokenizer_path=self.model_path,
+                        port=sgl_port,
+                        **self._model_config,
+                    )
+                except:
+                    logger.exception("Creating sglang Runtime failed")
+                    self._loading_error = sys.exc_info()
+
+            self._loading_thread = threading.Thread(target=_load)
+            self._loading_thread.start()
+            if self._shard == 0:
+                # wait for 3 seconds to ensure torch distributed inited first
+                self._loading_thread.join(3)
+        else:
+            logger.info(
+                f"Loading {self.model_uid} with following model config: {self._model_config}"
+            )
+
+            self._engine = sgl.Runtime(
+                model_path=self.model_path,
+                tokenizer_path=self.model_path,
+                port=sgl_port,
+                **self._model_config,
+            )
+
+    def wait_for_load(self):
+        if self._loading_thread:
+            if self._shard == 0:
+                # for the shard 0, we wait it to complete
+                # the sglang will serve forever for the other shards,
+                # so we only check if any error happens.
+                self._loading_thread.join()
+            if self._loading_error:
+                _, err, tb = self._loading_error
+                raise err.with_traceback(tb)
 
     def stop(self):
-        logger.info("Stopping SGLang engine")
+        logger.info("Stopping SGLang engine, sglang pid: %s", self._engine.pid)
         self._engine.shutdown()
 
     def _sanitize_model_config(
@@ -151,7 +239,7 @@ class SGLANGModel(LLM):
         cuda_count = self._get_cuda_count()
         model_config.setdefault("tokenizer_mode", "auto")
         model_config.setdefault("trust_remote_code", True)
-        model_config.setdefault("tp_size", cuda_count)
+        model_config.setdefault("tp_size", cuda_count * self._n_worker)
         # See https://github.com/sgl-project/sglang/blob/00023d622a6d484e67ef4a0e444f708b8fc861c8/python/sglang/srt/server_args.py#L100-L109
         mem_fraction_static = model_config.get("mem_fraction_static")
         if mem_fraction_static is None:
@@ -159,7 +247,7 @@ class SGLANGModel(LLM):
             if tp_size >= 16:
                 model_config["mem_fraction_static"] = 0.79
             elif tp_size >= 8:
-                model_config["mem_fraction_static"] = 0.83
+                model_config["mem_fraction_static"] = 0.81
             elif tp_size >= 4:
                 model_config["mem_fraction_static"] = 0.85
             elif tp_size >= 2:
@@ -39,7 +39,12 @@ from ....types import (
 from ...utils import select_device
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import LLAMA3_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    LLAMA3_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+)
 from .utils import get_context_length, get_max_src_len, pad_prefill_tokens
 
 logger = logging.getLogger(__name__)
@@ -62,6 +67,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "MiniCPM-V-2.6",
     "glm-4v",
     "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
     "qwen2-audio",
     "qwen2-audio-instruct",
     "deepseek-v2",
@@ -681,6 +687,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             or model_family in LLAMA3_TOOL_CALL_FAMILY
         ):
             full_context_kwargs["tools"] = tools
+        elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+            self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages,
@@ -265,15 +265,24 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
+        model_name = f"{self.model_family.model_name.lower()}-{model_size}"
         num_layers = {
-            "1B": 24,
-            "2B": 24,
-            "4B": 32,
-            "8B": 32,
-            "26B": 48,
-            "40B": 60,
-            "76B": 80,
-        }[model_size]
+            "internvl2-1B": 24,
+            "internvl2-2B": 24,
+            "internvl2-4B": 32,
+            "internvl2-8B": 32,
+            "internvl2-26B": 48,
+            "internvl2-40B": 60,
+            "internvl2-76B": 80,
+            "internvl2.5-1B": 24,
+            "internvl2.5-2B": 24,
+            "internvl2.5-4B": 36,
+            "internvl2.5-8B": 32,
+            "internvl2.5-26B": 48,
+            "internvl2.5-38B": 64,
+            "internvl2.5-78B": 80,
+        }[model_name]
+
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
         num_layers_per_gpu = [num_layers_per_gpu] * world_size
@@ -322,9 +331,7 @@ class InternVLChatModel(PytorchChatModel):
             self._model.cuda()
 
         self._tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True,
-            use_fast=False,
+            self.model_path, trust_remote_code=True, use_fast=False
         )
 
     @cache_clean
@@ -339,11 +346,12 @@
         IMG_END_TOKEN = "</img>"
         IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
 
+        generate_config = generate_config if isinstance(generate_config, dict) else {}
+
         generation_config = {
-            "max_new_tokens": generate_config.get("max_tokens", 1024)
-            if generate_config
-            else 1024,
+            "max_new_tokens": (generate_config.get("max_tokens", 1024)),
             "do_sample": False,
+            "temperature": generate_config.get("temperature", None),
         }
 
         stream = (
@@ -458,6 +466,7 @@ class InternVLChatModel(PytorchChatModel):
         streamer = TextIteratorStreamer(
             self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
         )
+
         # Define the generation configuration
         generate_kwargs["streamer"] = streamer
         # Start the model chat in a separate thread
@@ -55,9 +55,9 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
-        self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
+        self._device = device
 
         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
@@ -105,6 +105,8 @@ class Qwen2AudioChatModel(PytorchChatModel):
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False
@@ -45,9 +45,13 @@ class Qwen2VLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
         return False
@@ -55,6 +59,11 @@ class Qwen2VLChatModel(PytorchChatModel):
     def load(self):
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
         self._device = device
@@ -66,8 +75,16 @@ class Qwen2VLChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
         if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
                 device_map=device,
@@ -76,14 +93,14 @@
             ).eval()
         elif is_npu_available():
             # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype="float16",
            ).eval()
         else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
             ).eval()