xinference 1.8.1rc1__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +2 -1
  3. xinference/core/model.py +5 -0
  4. xinference/core/supervisor.py +2 -3
  5. xinference/core/worker.py +3 -4
  6. xinference/deploy/local.py +5 -0
  7. xinference/deploy/worker.py +6 -0
  8. xinference/model/core.py +3 -0
  9. xinference/model/embedding/sentence_transformers/core.py +3 -4
  10. xinference/model/embedding/vllm/core.py +4 -3
  11. xinference/model/image/model_spec.json +69 -0
  12. xinference/model/image/stable_diffusion/core.py +22 -0
  13. xinference/model/llm/cache_manager.py +17 -3
  14. xinference/model/llm/harmony.py +245 -0
  15. xinference/model/llm/llm_family.json +293 -8
  16. xinference/model/llm/llm_family.py +1 -1
  17. xinference/model/llm/sglang/core.py +108 -5
  18. xinference/model/llm/transformers/core.py +15 -7
  19. xinference/model/llm/transformers/gemma3.py +1 -1
  20. xinference/model/llm/transformers/gpt_oss.py +91 -0
  21. xinference/model/llm/transformers/multimodal/core.py +1 -1
  22. xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
  23. xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
  24. xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
  25. xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
  26. xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
  27. xinference/model/llm/transformers/utils.py +1 -33
  28. xinference/model/llm/utils.py +61 -7
  29. xinference/model/llm/vllm/core.py +38 -8
  30. xinference/model/rerank/__init__.py +66 -23
  31. xinference/model/rerank/cache_manager.py +35 -0
  32. xinference/model/rerank/core.py +84 -339
  33. xinference/model/rerank/custom.py +33 -8
  34. xinference/model/rerank/model_spec.json +251 -212
  35. xinference/model/rerank/rerank_family.py +137 -0
  36. xinference/model/rerank/sentence_transformers/__init__.py +13 -0
  37. xinference/model/rerank/sentence_transformers/core.py +337 -0
  38. xinference/model/rerank/vllm/__init__.py +13 -0
  39. xinference/model/rerank/vllm/core.py +106 -0
  40. xinference/model/utils.py +109 -0
  41. xinference/types.py +2 -0
  42. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  43. xinference/ui/web/ui/build/index.html +1 -1
  44. xinference/ui/web/ui/build/static/js/{main.b969199a.js → main.4918643a.js} +3 -3
  45. xinference/ui/web/ui/build/static/js/{main.b969199a.js.map → main.4918643a.js.map} +1 -1
  46. xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
  47. xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
  48. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +1 -0
  49. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
  50. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +1 -0
  51. xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
  52. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/METADATA +6 -1
  53. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/RECORD +58 -50
  54. xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
  55. xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
  56. xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
  57. xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
  58. xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
  59. xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
  60. /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.4918643a.js.LICENSE.txt} +0 -0
  61. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/WHEEL +0 -0
  62. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/entry_points.txt +0 -0
  63. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/licenses/LICENSE +0 -0
  64. {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/top_level.txt +0 -0
@@ -286,12 +286,18 @@ class PytorchModel(LLM):
 
         kwargs = {}
 
-        dtype = get_device_preferred_dtype(self._device)
-
-        if dtype is not None:
-            kwargs["torch_dtype"] = dtype
+        torch_dtype = self._pytorch_model_config.get("torch_dtype")
+        if torch_dtype is not None:
+            if isinstance(torch_dtype, str) and torch_dtype != "auto":
+                torch_dtype = getattr(torch, torch_dtype)
+            kwargs["torch_dtype"] = torch_dtype
         else:
-            raise ValueError(f"Device {self._device} is not supported in temporary")
+            dtype = get_device_preferred_dtype(self._device)
+
+            if dtype is not None:
+                kwargs["torch_dtype"] = dtype
+            else:
+                raise ValueError(f"Device {self._device} is not supported in temporary")
 
         kwargs["revision"] = self._pytorch_model_config.get(
             "revision", self.model_spec.model_revision
@@ -327,6 +333,8 @@ class PytorchModel(LLM):
             reasoning_content, enable_thinking=enable_thinking
         )
 
+        logger.debug("Loading Transformers model with kwargs: %s", kwargs)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
@@ -488,7 +496,7 @@ class PytorchModel(LLM):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:
@@ -878,7 +886,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:
@@ -28,7 +28,7 @@ class Gemma3TextChatModel(PytorchChatModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-1b-it".lower() in llm_family.lower():
@@ -0,0 +1,91 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import logging
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    PytorchGenerateConfig,
+    PytorchModelConfig,
+)
+from ..harmony import async_stream_harmony_chat_completion
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from .core import PytorchChatModel, register_non_default_model
+
+logger = logging.getLogger(__name__)
+
+
+@register_transformer
+@register_non_default_model("gpt-oss")
+class GPTOSSPytorchChatModel(PytorchChatModel):
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        config = super()._sanitize_model_config(pytorch_model_config)
+        config.setdefault("torch_dtype", "auto")
+        return config  # type:ignore
+
+    @classmethod
+    def match_json(
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if "gpt" not in model_family and "oss" not in model_family:
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    async def chat(  # type:ignore
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        gen = super().chat(messages, generate_config=generate_config)
+
+        if inspect.iscoroutine(gen):
+            gen = await gen
+
+        if inspect.isasyncgen(gen):
+            # Streaming
+            async def stream_parser():
+                full_text = ""
+                full_reasoning = ""
+
+                async for parsed_chunk in async_stream_harmony_chat_completion(gen):
+                    choices = parsed_chunk.get("choices")
+                    if choices and len(choices) > 0:
+                        delta = choices[0].get("delta", {})
+                        if delta.get("content"):
+                            full_text += delta["content"]
+                        if delta.get("reasoning_content"):
+                            full_reasoning += delta["reasoning_content"]
+                    yield parsed_chunk
+
+                logger.debug(
+                    "Chat finished, content: %r, reasoning: %r",
+                    full_text,
+                    full_reasoning,
+                )
+
+            return stream_parser()
+
+        else:
+            # Non-streaming sync - handle single result
+            async for parsed_completion in async_stream_harmony_chat_completion(gen):  # type: ignore
+                return parsed_completion
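GPTOSSPytorchChatModel.chat above returns either an async generator of Harmony-parsed chunks (streaming) or a single parsed completion. A hedged consumer sketch, assuming the chunk and completion dict shapes implied by the class itself; this is not a documented client API:

    async def consume(model, messages):
        # `model` is assumed to be a loaded GPTOSSPytorchChatModel instance.
        result = await model.chat(messages, generate_config={"stream": True})
        if hasattr(result, "__aiter__"):
            # Streaming: chunks carry incremental "content" / "reasoning_content" deltas.
            async for chunk in result:
                choices = chunk.get("choices") or []
                if choices:
                    delta = choices[0].get("delta", {})
                    print(delta.get("reasoning_content") or delta.get("content") or "", end="")
        else:
            # Non-streaming: a single ChatCompletion-style dict is assumed.
            print(result["choices"][0]["message"]["content"])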
@@ -21,9 +21,9 @@ from .....types import (
     CompletionChunk,
     PytorchGenerateConfig,
 )
+from ....utils import cache_clean
 from ...utils import generate_chat_completion, generate_completion_chunk
 from ..core import PytorchChatModel
-from ..utils import cache_clean
 
 
 class PytorchMultiModalModel(PytorchChatModel):
@@ -31,7 +31,7 @@ class Gemma3ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-it".lower() in llm_family.lower():
@@ -28,14 +28,14 @@ logger = logging.getLogger(__name__)
 
 
 @register_transformer
-@register_non_default_model("glm-4.1v-thinking")
+@register_non_default_model("glm-4.1v-thinking", "glm-4.5v")
 class Glm4_1VModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
-        if "glm-4.1v" in family.lower():
+        if "glm-4.1v" in family.lower() or "glm-4.5v" in family.lower():
             return True
         return False
 
@@ -37,7 +37,7 @@ class Ovis2ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "ovis2".lower() in llm_family.lower():
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
-import importlib.util
 import io
 import logging
 import time
@@ -20,13 +19,13 @@ import uuid
 from threading import Thread
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
-from .....model.utils import select_device
 from .....types import (
     ChatCompletion,
     ChatCompletionAudio,
     ChatCompletionChoice,
     CompletionUsage,
 )
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import PytorchGenerateConfig, register_non_default_model
 from .core import PytorchMultiModalModel
@@ -46,7 +45,7 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2.5-omni".lower() in llm_family.lower():
@@ -71,12 +70,12 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
 
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if self._device == "cuda" else self._device
-        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
-        kwargs = (
-            {}
-            if not flash_attn_installed
-            else {"attn_implementation": "flash_attention_2"}
+        kwargs = {}
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
         )
+        if enable_flash_attn:
+            kwargs["attn_implementation"] = "flash_attention_2"
         kwargs = self.apply_bnb_quantization(kwargs)
         logger.debug("Loading model with extra kwargs: %s", kwargs)
 
@@ -11,15 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import importlib.util
 import logging
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 from .....core.model import register_batching_multimodal_models
 from .....device_utils import is_npu_available
-from .....model.utils import select_device
 from .....types import PytorchModelConfig
 from ....scheduler.request import InferenceRequest
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
@@ -48,7 +47,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -87,7 +86,6 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             Qwen2_5_VLForConditionalGeneration = None
 
         kwargs = self.apply_bnb_quantization()
-        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
         llm_family = self.model_family.model_family or self.model_family.model_name
         model_cls = (
             Qwen2_5_VLForConditionalGeneration
@@ -97,12 +95,17 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
-        if flash_attn_installed:
+
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
+        )
+
+        if enable_flash_attn:
             self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
-                device_map=device,
                 attn_implementation="flash_attention_2",
+                device_map=device,
                 trust_remote_code=True,
                 **kwargs,
             ).eval()
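Both flash-attention hunks (qwen-omni.py and qwen2_vl.py) now read an enable_flash_attn flag from the pytorch model config and fall back to auto-detection. A minimal sketch of that decision; is_flash_attn_available is assumed here to simply probe for the flash_attn package, matching the importlib check it replaces:

    import importlib.util

    def is_flash_attn_available() -> bool:
        # Assumed behaviour: flash attention is usable when flash_attn is importable.
        return importlib.util.find_spec("flash_attn") is not None

    def build_attn_kwargs(pytorch_model_config: dict) -> dict:
        # An explicit enable_flash_attn setting wins; otherwise fall back to detection.
        enable = pytorch_model_config.get("enable_flash_attn", is_flash_attn_available())
        return {"attn_implementation": "flash_attention_2"} if enable else {}

    # e.g. force flash attention off even when the package is installed:
    assert build_attn_kwargs({"enable_flash_attn": False}) == {}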
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
-import functools
+
 import logging
 import os
 import time
@@ -495,34 +494,3 @@ def batch_inference_one_step(
         for r in req_list:
             r.stopped = True
             r.error_msg = str(e)
-
-
-def cache_clean(fn):
-    @functools.wraps(fn)
-    async def _async_wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = await fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    @functools.wraps(fn)
-    def _wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    if asyncio.iscoroutinefunction(fn):
-        return _async_wrapper
-    else:
-        return _wrapper
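cache_clean is removed here but not dropped: the new import in transformers/multimodal/core.py (from ....utils import cache_clean) and the +109 lines in xinference/model/utils.py in the file list indicate it moved up a package level. A condensed sketch of the same decorator pattern, with the xinference-specific empty_cache call replaced by a comment:

    import asyncio
    import functools
    import gc

    def cache_clean(fn):
        # Wrap an inference method so memory is reclaimed after every call;
        # the async or sync wrapper is chosen based on the wrapped function.
        @functools.wraps(fn)
        async def _async_wrapper(self, *args, **kwargs):
            result = await fn(self, *args, **kwargs)
            gc.collect()  # xinference additionally calls device_utils.empty_cache()
            return result

        @functools.wraps(fn)
        def _wrapper(self, *args, **kwargs):
            result = fn(self, *args, **kwargs)
            gc.collect()  # xinference additionally calls device_utils.empty_cache()
            return result

        return _async_wrapper if asyncio.iscoroutinefunction(fn) else _wrapper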
@@ -67,6 +67,9 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen3",
     "HuatuoGPT-o1-Qwen2.5",
     "DianJin-R1",
+    "Qwen3-Thinking",
+    "Qwen3-Instruct",
+    "Qwen3-Coder",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -79,9 +82,7 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
-DEEPSEEK_TOOL_CALL_FAMILY = [
-    "deepseek-v3",
-]
+DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528"]
 
 TOOL_CALL_FAMILY = (
     QWEN_TOOL_CALL_FAMILY
@@ -167,8 +168,7 @@ class ChatModelMixin:
                 return json.loads(kwargs)
             except json.JSONDecodeError:
                 raise TypeError(
-                    f"`chat_template_kwargs` should be json parsable, "
-                    f"got: {kwargs}"
+                    f"`chat_template_kwargs` should be json parsable, got: {kwargs}"
                 )
         elif isinstance(kwargs, dict):
             return kwargs
@@ -254,7 +254,7 @@ class ChatModelMixin:
                 ret += role + "\n" + text + intra_message_sep + "\n"
             else:
                 placeholders = "\n".join(
-                    f"Image-{i+1}: <image>\n"
+                    f"Image-{i + 1}: <image>\n"
                     for i in range(
                         len(images) - len(image_futures), len(images)
                     )
@@ -463,6 +463,7 @@ class ChatModelMixin:
             chat_context_var.set(ctx)
 
         previous_texts = [""]
+        full_text = ""
         # Process chunks
         if reasoning_parser:
             set_context()
@@ -474,10 +475,14 @@
                 # usage
                 chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
+                if choices[0].get("text"):
+                    full_text += choices[0]["text"]  # type: ignore
+
                 chat_chunk = cls._to_chat_completion_chunk(
                     chunk, reasoning_parser, previous_texts
                 )
             yield chat_chunk
+        logger.debug("Chat finished, output: %s", full_text)
 
     @staticmethod
     def _to_chat_completion(
@@ -683,6 +688,52 @@
 
         return results
 
+    @classmethod
+    def _eval_deepseek_r1_arguments(cls, c) -> List[Tuple]:
+        """
+        Parses tool calls from deepseek-r1 (0528) chat template format.
+        Returns:
+            List of (None, function_name, arguments_dict)
+            or (raw_content, None, None) if parsing fails.
+        """
+        text = c["choices"][0]["text"]
+        pattern = (
+            r"<\|tool▁call▁begin|>function<\|tool▁sep|>([^\n]+)\n"
+            r"```json\n(.*?)\n```<\|tool▁call▁end|>"
+        )
+
+        matches = re.findall(pattern, text, re.DOTALL)
+        if not matches:
+            return [(text, None, None)]
+
+        tool_calls = set()
+        results = []
+
+        for func_name, raw_json in matches:
+            func_and_args = None
+            try:
+                func_and_args = json.loads(raw_json)
+                arguments_hashable = frozenset(func_and_args.items())
+                tool_call_tuple = (
+                    None,
+                    func_name,
+                    func_and_args,
+                )
+            except Exception:
+                tool_call_tuple = (raw_json, None, None)
+                arguments_hashable = None
+
+            dedup_key = (
+                (func_name, arguments_hashable)
+                if func_and_args is not None
+                else raw_json
+            )
+            if dedup_key not in tool_calls:
+                tool_calls.add(dedup_key)
+                results.append(tool_call_tuple)
+
+        return results
+
     @classmethod
     def _eval_tool_arguments(
         cls, model_family, c, tool_call_text: Optional[str] = None
@@ -695,7 +746,10 @@
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
         elif family in DEEPSEEK_TOOL_CALL_FAMILY:
-            result = cls._eval_deepseek_chat_arguments(c)
+            if family == "deepseek-r1-0528":
+                result = cls._eval_deepseek_r1_arguments(c)
+            else:
+                result = cls._eval_deepseek_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
@@ -89,6 +89,7 @@ class VLLMModelConfig(TypedDict, total=False):
     mm_processor_kwargs: NotRequired[dict[str, Any]]
     min_pixels: NotRequired[int]
     max_pixels: NotRequired[int]
+    enable_expert_parallel: bool
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
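enable_expert_parallel is a new VLLMModelConfig key; vLLM exposes an engine argument of the same name for MoE expert parallelism. A hedged usage sketch, assuming extra launch kwargs are forwarded into the vLLM model config the same way as the other keys in this TypedDict:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    # The extra keyword argument is assumed to land in VLLMModelConfig and reach
    # vLLM's engine arguments; the model name here is only an example.
    model_uid = client.launch_model(
        model_name="glm-4.5",
        model_engine="vLLM",
        enable_expert_parallel=True,
    )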
@@ -273,8 +274,12 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
 
-if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
+
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
 
 
 class VLLMModel(LLM):
@@ -557,7 +562,9 @@ class VLLMModel(LLM):
             raise err.with_traceback(tb)
 
         # set context length after engine inited
-        self._set_context_length()
+        # if shard > 0, the engine will be inited in another process
+        if self._engine:
+            self._set_context_length()
 
     def _set_context_length(self):
         from vllm import envs
@@ -839,7 +846,7 @@ class VLLMModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -1187,7 +1194,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
+        if llm_spec.model_format not in [
+            "pytorch",
+            "gptq",
+            "awq",
+            "fp8",
+            "bnb",
+            "ggufv2",
+        ]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -1284,6 +1298,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         previous_texts = [""]
         tool_call = False
         tool_call_texts = [""]
+        full_text = ""
         if self.reasoning_parser:
             set_context()
             chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
@@ -1299,6 +1314,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
+                full_text += chunk["choices"][0]["text"]
                 if self.is_tool_call_chunk_start(chunk):
                     tool_call = True
                 if tool_call:
@@ -1320,6 +1336,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                     chunk, self.reasoning_parser, previous_texts
                 )
                 i += 1
+        logger.debug("Chat finished, output: %s", full_text)
 
     @vllm_check
     async def async_chat(
@@ -1348,13 +1365,26 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         ):
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
 
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
 
+        lora_request = None
+        lora_model = generate_config.get("lora_name")
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+        tokenizer = await self._get_tokenizer(lora_request)
+
+        full_prompt = self.get_full_context(
+            messages,
+            self.model_family.chat_template,
+            tokenizer=tokenizer,
+            **full_context_kwargs,
+        )
+
         if stream:
             agen = await self.async_generate(
                 full_prompt, generate_config, tools, request_id=request_id
@@ -1386,7 +1416,7 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):