xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +2 -1
  3. xinference/core/model.py +8 -4
  4. xinference/core/supervisor.py +2 -3
  5. xinference/core/worker.py +7 -5
  6. xinference/deploy/cmdline.py +2 -0
  7. xinference/deploy/local.py +5 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/deploy/worker.py +6 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/model_spec.json +44 -20
  12. xinference/model/core.py +3 -0
  13. xinference/model/embedding/flag/core.py +5 -0
  14. xinference/model/embedding/llama_cpp/core.py +22 -19
  15. xinference/model/embedding/sentence_transformers/core.py +18 -4
  16. xinference/model/embedding/vllm/core.py +36 -9
  17. xinference/model/image/cache_manager.py +56 -0
  18. xinference/model/image/core.py +9 -0
  19. xinference/model/image/model_spec.json +178 -1
  20. xinference/model/image/stable_diffusion/core.py +155 -23
  21. xinference/model/llm/cache_manager.py +17 -3
  22. xinference/model/llm/harmony.py +245 -0
  23. xinference/model/llm/llama_cpp/core.py +41 -40
  24. xinference/model/llm/llm_family.json +688 -11
  25. xinference/model/llm/llm_family.py +1 -1
  26. xinference/model/llm/sglang/core.py +108 -5
  27. xinference/model/llm/transformers/core.py +20 -18
  28. xinference/model/llm/transformers/gemma3.py +1 -1
  29. xinference/model/llm/transformers/gpt_oss.py +91 -0
  30. xinference/model/llm/transformers/multimodal/core.py +1 -1
  31. xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
  32. xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
  33. xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
  34. xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
  35. xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
  36. xinference/model/llm/transformers/utils.py +1 -33
  37. xinference/model/llm/utils.py +61 -7
  38. xinference/model/llm/vllm/core.py +44 -8
  39. xinference/model/rerank/__init__.py +66 -23
  40. xinference/model/rerank/cache_manager.py +35 -0
  41. xinference/model/rerank/core.py +87 -339
  42. xinference/model/rerank/custom.py +33 -8
  43. xinference/model/rerank/model_spec.json +251 -212
  44. xinference/model/rerank/rerank_family.py +137 -0
  45. xinference/model/rerank/sentence_transformers/__init__.py +13 -0
  46. xinference/model/rerank/sentence_transformers/core.py +337 -0
  47. xinference/model/rerank/vllm/__init__.py +13 -0
  48. xinference/model/rerank/vllm/core.py +156 -0
  49. xinference/model/utils.py +108 -0
  50. xinference/model/video/model_spec.json +95 -1
  51. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  52. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  53. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  54. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  55. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  56. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  57. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  58. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  59. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  60. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  61. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  62. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  63. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  64. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  65. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  66. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  67. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  69. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  70. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  71. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  72. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  73. xinference/types.py +2 -0
  74. xinference/ui/gradio/chat_interface.py +2 -0
  75. xinference/ui/gradio/media_interface.py +353 -7
  76. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  77. xinference/ui/web/ui/build/index.html +1 -1
  78. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  79. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  80. xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
  81. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  82. xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
  83. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  88. xinference/ui/web/ui/src/locales/en.json +2 -0
  89. xinference/ui/web/ui/src/locales/ja.json +2 -0
  90. xinference/ui/web/ui/src/locales/ko.json +2 -0
  91. xinference/ui/web/ui/src/locales/zh.json +2 -0
  92. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
  93. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
  94. xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
  95. xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
  96. xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
  97. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  98. xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
  99. xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
  100. xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
  101. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  102. xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
  103. xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
  104. /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  105. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
  106. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
  107. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
  108. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
--- a/xinference/model/llm/llm_family.py
+++ b/xinference/model/llm/llm_family.py
@@ -78,7 +78,7 @@ class LlamaCppLLMSpecV2(BaseModel):
 
 
 class PytorchLLMSpecV2(BaseModel):
-    model_format: Literal["pytorch", "gptq", "awq", "fp8"]
+    model_format: Literal["pytorch", "gptq", "awq", "fp8", "bnb"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantization: str
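
Here "bnb" denotes bitsandbytes on-the-fly quantization, now accepted alongside the pre-quantized formats. Purely as an illustration (the field values below are hypothetical, and the real spec has further required fields):

    spec_fragment = {
        "model_format": "bnb",        # new in this release
        "model_size_in_billions": 7,  # illustrative
        "quantization": "4-bit",      # illustrative
    }
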
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -39,6 +39,7 @@ from ..llm_family import CustomLLMFamilyV2
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_SYMBOLS,
     ChatModelMixin,
     generate_completion_chunk,
 )
@@ -337,7 +338,7 @@ class SGLANGModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -471,6 +472,7 @@ class SGLANGModel(LLM):
         *,
         image_data: Optional[Union[List[str], str]] = None,
         generate_config: Optional[SGLANGGenerateConfig] = None,
+        tools: Optional[List[Dict]] = None,
         request_id: Optional[str] = None,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         sanitized_generate_config = self._sanitize_generate_config(generate_config)
@@ -501,6 +503,10 @@
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+            complete_response = ""
+            match_tool_call_tmp_results: List[CompletionChunk] = []
+            is_match_tool_call = False
+            chunk = None
             finish_reason = None
             async for meta_info, out in self._stream_generate(
                 prompt, image_data, **sanitized_generate_config
@@ -508,6 +514,7 @@
                 chunk = self._convert_state_to_completion_chunk(
                     request_id, self.model_uid, output_text=out
                 )
+                complete_response += out
                 finish_reason = meta_info["finish_reason"]
                 prompt_tokens = meta_info["prompt_tokens"]
                 completion_tokens = meta_info["completion_tokens"]
@@ -517,6 +524,49 @@
                     completion_tokens=completion_tokens,
                     total_tokens=total_tokens,
                 )
+                if tools:
+                    """
+                    The qwen2 tool call returns format like this:
+                    <tool_call>
+                    {...}
+                    </tool_call>
+                    Here is to match this.
+                    """
+                    if (
+                        len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)
+                    ) and (
+                        not QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response)
+                    ):
+                        for c in match_tool_call_tmp_results:
+                            yield c
+                        match_tool_call_tmp_results.clear()
+                        yield chunk
+                    elif (
+                        len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)
+                    ) and (QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response)):
+                        match_tool_call_tmp_results.append(chunk)
+                    else:
+                        assert len(QWEN_TOOL_CALL_SYMBOLS[0]) <= len(
+                            complete_response
+                        )
+                        if not is_match_tool_call and complete_response.startswith(
+                            QWEN_TOOL_CALL_SYMBOLS[0]
+                        ):
+                            is_match_tool_call = True
+                            match_tool_call_tmp_results.clear()
+
+                        if not is_match_tool_call:
+                            for c in match_tool_call_tmp_results:
+                                yield c
+                            match_tool_call_tmp_results.clear()
+                            yield chunk
+                        else:
+                            chunk["choices"][0]["text"] = complete_response
+                else:
+                    yield chunk
+
+            if is_match_tool_call:
+                assert chunk is not None
                 yield chunk
 
             finish_reason = (
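
The buffering rule above is easier to see in isolation. A toy restatement, assuming QWEN_TOOL_CALL_SYMBOLS holds the start/end tags shown in the docstring:

    QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]

    def classify(complete_response: str) -> str:
        start = QWEN_TOOL_CALL_SYMBOLS[0]
        if len(start) > len(complete_response):
            # too little text to decide yet
            return "buffer" if start.startswith(complete_response) else "flush"
        # enough text: it either is a tool call or it is not
        return "tool-call" if complete_response.startswith(start) else "flush"

    assert classify("<tool") == "buffer"             # may still become a tool call
    assert classify("Hello") == "flush"              # plain text, emit buffered chunks
    assert classify("<tool_call>{...}") == "tool-call"

Chunks classified "buffer" are held in match_tool_call_tmp_results; once the prefix is confirmed, the buffered chunks are dropped and the whole accumulated text is carried on one final chunk instead.
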
@@ -561,7 +611,7 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -588,6 +638,57 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
             generate_config.pop("chat_template_kwargs", None)
         return generate_config
 
+    @staticmethod
+    def is_tool_call_chunk_start(chunk):
+        return chunk["choices"][0]["text"].startswith(QWEN_TOOL_CALL_SYMBOLS[0])
+
+    @staticmethod
+    def is_tool_call_chunk_end(chunk):
+        return chunk["choices"][0]["text"].endswith(QWEN_TOOL_CALL_SYMBOLS[1])
+
+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        i = 0
+        previous_texts = [""]
+        tool_call = False
+        tool_call_texts = [""]
+        if self.reasoning_parser:
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for chunk in chunks:
+            if i == 0:
+                for first_chunk in self._get_first_chat_completion_chunk(
+                    chunk, self.reasoning_parser
+                ):
+                    yield first_chunk
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield self._get_final_chat_completion_chunk(chunk)
+            else:
+                if self.is_tool_call_chunk_start(chunk):
+                    tool_call = True
+                if tool_call:
+                    tool_call_text = tool_call_texts[-1]
+                    tool_call_text += chunk["choices"][0]["text"]
+                    tool_call_texts.append(tool_call_text)
+                    if self.is_tool_call_chunk_end(chunk):
+                        yield self._post_process_completion_chunk(
+                            self.model_family,
+                            self.model_uid,
+                            chunk,
+                            reasoning_parser=self.reasoning_parser,
+                            tool_call_text=tool_call_text,
+                        )
+                        tool_call = False
+                        tool_call_texts = [""]
+                else:
+                    yield self._to_chat_completion_chunk(
+                        chunk, self.reasoning_parser, previous_texts
+                    )
+            i += 1
+
     async def async_chat(
         self,
         messages: List[Dict],
@@ -618,13 +719,15 @@
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
         if stream:
-            agen = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
+            agen = await self.async_generate(full_prompt, generate_config=generate_config, tools=tools)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
+            if tools:
+                return self._async_to_tool_completion_chunks(agen)
             return self._async_to_chat_completion_chunks(
                 agen, self.reasoning_parser, chat_template_kwargs
             )
         else:
-            c = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
+            c = await self.async_generate(full_prompt, generate_config=generate_config, tools=tools)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
             if tools:
                 return self._post_process_completion(
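
A hedged sketch of driving this new path from the Python client; the endpoint, model uid, chat signature details, and tool schema are assumptions for illustration, not values confirmed by this diff:

    from xinference.client import Client

    client = Client("http://localhost:9997")
    model = client.get_model("my-qwen-model")  # hypothetical model uid
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]
    # streaming with tools now routes through _async_to_tool_completion_chunks
    for chunk in model.chat(
        messages=[{"role": "user", "content": "Weather in Paris?"}],
        tools=tools,
        generate_config={"stream": True},
    ):
        print(chunk)
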
@@ -642,7 +745,7 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
--- a/xinference/model/llm/transformers/core.py
+++ b/xinference/model/llm/transformers/core.py
@@ -286,12 +286,18 @@ class PytorchModel(LLM):
 
         kwargs = {}
 
-        dtype = get_device_preferred_dtype(self._device)
-
-        if dtype is not None:
-            kwargs["torch_dtype"] = dtype
+        torch_dtype = self._pytorch_model_config.get("torch_dtype")
+        if torch_dtype is not None:
+            if isinstance(torch_dtype, str) and torch_dtype != "auto":
+                torch_dtype = getattr(torch, torch_dtype)
+            kwargs["torch_dtype"] = torch_dtype
         else:
-            raise ValueError(f"Device {self._device} is not supported in temporary")
+            dtype = get_device_preferred_dtype(self._device)
+
+            if dtype is not None:
+                kwargs["torch_dtype"] = dtype
+            else:
+                raise ValueError(f"Device {self._device} is not supported in temporary")
 
         kwargs["revision"] = self._pytorch_model_config.get(
             "revision", self.model_spec.model_revision
@@ -327,6 +333,8 @@
             reasoning_content, enable_thinking=enable_thinking
         )
 
+        logger.debug("Loading Transformers model with kwargs: %s", kwargs)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
@@ -488,7 +496,7 @@
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:
@@ -539,15 +547,13 @@
         So we need pad `0` on the left again.
         """
         data = []
+        max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
         for r in reqs:
             r.extra_kwargs["attention_mask_seq_len"] += 1
+            real_len = r.extra_kwargs["attention_mask_seq_len"]
+            pad_len = max_len - real_len
+
             if self._tokenizer.padding_side == "left":
-                attention_mask_seq_len = r.extra_kwargs["attention_mask_seq_len"]
-                pad_len = seq_length - attention_mask_seq_len
-                assert pad_len >= 0, (
-                    f"pad_len must be greater equal 0, got {pad_len} = "
-                    f"seq_length({seq_length}) - attention_mask_seq_len({attention_mask_seq_len})"
-                )
                 x = torch.cat(
                     [
                         (
@@ -555,14 +561,10 @@
                         if pad_len > 0
                         else torch.tensor([], dtype=torch.long)
                     ),
-                    torch.ones((attention_mask_seq_len,), dtype=torch.long),
+                    torch.ones((real_len,), dtype=torch.long),
                 ]
             )
         else:
-            max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs)
-            real_len = r.extra_kwargs["attention_mask_seq_len"]
-            pad_len = max_len - real_len
-
             x = torch.cat(
                 [
                     torch.ones((real_len,), dtype=torch.long),
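
Net effect: max_len is now computed once for the whole batch (plus one for the token just generated), and every request's run of real_len ones is padded to it on the tokenizer's padding side. A toy equivalent:

    import torch

    def pad_masks(lengths, padding_side="left"):
        max_len = max(lengths) + 1  # +1 for the newly generated token
        rows = []
        for n in lengths:
            real_len = n + 1
            pad = torch.zeros((max_len - real_len,), dtype=torch.long)
            ones = torch.ones((real_len,), dtype=torch.long)
            rows.append(torch.cat([pad, ones] if padding_side == "left" else [ones, pad]))
        return torch.stack(rows)

    print(pad_masks([3, 5]))
    # tensor([[0, 0, 1, 1, 1, 1],
    #         [1, 1, 1, 1, 1, 1]])
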
@@ -878,7 +880,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:
--- a/xinference/model/llm/transformers/gemma3.py
+++ b/xinference/model/llm/transformers/gemma3.py
@@ -28,7 +28,7 @@ class Gemma3TextChatModel(PytorchChatModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-1b-it".lower() in llm_family.lower():
--- /dev/null
+++ b/xinference/model/llm/transformers/gpt_oss.py
@@ -0,0 +1,91 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import logging
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    PytorchGenerateConfig,
+    PytorchModelConfig,
+)
+from ..harmony import async_stream_harmony_chat_completion
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from .core import PytorchChatModel, register_non_default_model
+
+logger = logging.getLogger(__name__)
+
+
+@register_transformer
+@register_non_default_model("gpt-oss")
+class GPTOSSPytorchChatModel(PytorchChatModel):
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        config = super()._sanitize_model_config(pytorch_model_config)
+        config.setdefault("torch_dtype", "auto")
+        return config  # type: ignore
+
+    @classmethod
+    def match_json(
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if "gpt" not in model_family and "oss" not in model_family:
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    async def chat(  # type: ignore
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        gen = super().chat(messages, generate_config=generate_config)
+
+        if inspect.iscoroutine(gen):
+            gen = await gen
+
+        if inspect.isasyncgen(gen):
+            # Streaming
+            async def stream_parser():
+                full_text = ""
+                full_reasoning = ""
+
+                async for parsed_chunk in async_stream_harmony_chat_completion(gen):
+                    choices = parsed_chunk.get("choices")
+                    if choices and len(choices) > 0:
+                        delta = choices[0].get("delta", {})
+                        if delta.get("content"):
+                            full_text += delta["content"]
+                        if delta.get("reasoning_content"):
+                            full_reasoning += delta["reasoning_content"]
+                    yield parsed_chunk
+
+                logger.debug(
+                    "Chat finished, content: %r, reasoning: %r",
+                    full_text,
+                    full_reasoning,
+                )
+
+            return stream_parser()
+
+        else:
+            # Non-streaming sync - handle single result
+            async for parsed_completion in async_stream_harmony_chat_completion(gen):  # type: ignore
+                return parsed_completion
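
A hedged sketch of exercising this new backend from the Python client; the model name, engine, and endpoint values are assumptions, not confirmed by this diff:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="gpt-oss",         # assumed registry name for this family
        model_engine="transformers",  # would route to GPTOSSPytorchChatModel
        model_format="pytorch",
    )
    model = client.get_model(uid)
    print(model.chat(messages=[{"role": "user", "content": "Hello!"}]))

Note the _sanitize_model_config override: unless the caller sets one, such a launch runs with torch_dtype="auto", relying on the torch_dtype handling added to PytorchModel above.
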
--- a/xinference/model/llm/transformers/multimodal/core.py
+++ b/xinference/model/llm/transformers/multimodal/core.py
@@ -21,9 +21,9 @@ from .....types import (
     CompletionChunk,
     PytorchGenerateConfig,
 )
+from ....utils import cache_clean
 from ...utils import generate_chat_completion, generate_completion_chunk
 from ..core import PytorchChatModel
-from ..utils import cache_clean
 
 
 class PytorchMultiModalModel(PytorchChatModel):
--- a/xinference/model/llm/transformers/multimodal/gemma3.py
+++ b/xinference/model/llm/transformers/multimodal/gemma3.py
@@ -31,7 +31,7 @@ class Gemma3ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-it".lower() in llm_family.lower():
--- a/xinference/model/llm/transformers/multimodal/glm4_1v.py
+++ b/xinference/model/llm/transformers/multimodal/glm4_1v.py
@@ -28,14 +28,14 @@ logger = logging.getLogger(__name__)
 
 
 @register_transformer
-@register_non_default_model("glm-4.1v-thinking")
+@register_non_default_model("glm-4.1v-thinking", "glm-4.5v")
 class Glm4_1VModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
-        if "glm-4.1v" in family.lower():
+        if "glm-4.1v" in family.lower() or "glm-4.5v" in family.lower():
             return True
         return False
 
--- a/xinference/model/llm/transformers/multimodal/ovis2.py
+++ b/xinference/model/llm/transformers/multimodal/ovis2.py
@@ -37,7 +37,7 @@ class Ovis2ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "ovis2".lower() in llm_family.lower():
--- a/xinference/model/llm/transformers/multimodal/qwen-omni.py
+++ b/xinference/model/llm/transformers/multimodal/qwen-omni.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
-import importlib.util
 import io
 import logging
 import time
@@ -20,13 +19,13 @@ import uuid
 from threading import Thread
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
-from .....model.utils import select_device
 from .....types import (
     ChatCompletion,
     ChatCompletionAudio,
     ChatCompletionChoice,
     CompletionUsage,
 )
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import PytorchGenerateConfig, register_non_default_model
 from .core import PytorchMultiModalModel
@@ -46,7 +45,7 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2.5-omni".lower() in llm_family.lower():
@@ -71,12 +70,12 @@
 
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if self._device == "cuda" else self._device
-        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
-        kwargs = (
-            {}
-            if not flash_attn_installed
-            else {"attn_implementation": "flash_attention_2"}
+        kwargs = {}
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
         )
+        if enable_flash_attn:
+            kwargs["attn_implementation"] = "flash_attention_2"
         kwargs = self.apply_bnb_quantization(kwargs)
         logger.debug("Loading model with extra kwargs: %s", kwargs)
 
--- a/xinference/model/llm/transformers/multimodal/qwen2_vl.py
+++ b/xinference/model/llm/transformers/multimodal/qwen2_vl.py
@@ -11,15 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import importlib.util
 import logging
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 from .....core.model import register_batching_multimodal_models
 from .....device_utils import is_npu_available
-from .....model.utils import select_device
 from .....types import PytorchModelConfig
 from ....scheduler.request import InferenceRequest
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
@@ -48,7 +47,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -87,7 +86,6 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             Qwen2_5_VLForConditionalGeneration = None
 
         kwargs = self.apply_bnb_quantization()
-        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
         llm_family = self.model_family.model_family or self.model_family.model_name
         model_cls = (
             Qwen2_5_VLForConditionalGeneration
@@ -97,12 +95,17 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
-        if flash_attn_installed:
+
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
+        )
+
+        if enable_flash_attn:
             self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
-                device_map=device,
                 attn_implementation="flash_attention_2",
+                device_map=device,
                 trust_remote_code=True,
                 **kwargs,
             ).eval()
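
Both the Qwen2.5-Omni and Qwen2-VL loaders now consult pytorch_model_config["enable_flash_attn"], defaulting to auto-detection. A plausible shape for the shared helper, assuming it simply probes for the package (the real one in xinference/model/utils.py may check more, e.g. the device):

    import importlib.util

    def is_flash_attn_available() -> bool:
        # flash attention is usable only if the flash_attn package is importable
        return importlib.util.find_spec("flash_attn") is not None

The practical gain is that flash attention can now be forced off for a single model load (e.g. pytorch_model_config={"enable_flash_attn": False}) instead of being implied solely by whether flash_attn happens to be installed.
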
--- a/xinference/model/llm/transformers/utils.py
+++ b/xinference/model/llm/transformers/utils.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
-import functools
+
 import logging
 import os
 import time
@@ -495,34 +494,3 @@ def batch_inference_one_step(
         for r in req_list:
             r.stopped = True
             r.error_msg = str(e)
-
-
-def cache_clean(fn):
-    @functools.wraps(fn)
-    async def _async_wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = await fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    @functools.wraps(fn)
-    def _wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    if asyncio.iscoroutinefunction(fn):
-        return _async_wrapper
-    else:
-        return _wrapper
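
The decorator removed here survives as xinference.model.utils.cache_clean, which is what the multimodal core now imports (from ....utils import cache_clean, above). Usage is unchanged; a minimal sketch of decorating an inference method:

    from xinference.model.utils import cache_clean

    class DummyModel:
        @cache_clean
        def generate(self, prompt: str) -> str:
            # accelerator caches are emptied after each decorated call
            return prompt.upper()
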