xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
@@ -785,40 +785,6 @@
             "</s>"
         ]
     },
-    {
-        "version": 1,
-        "context_length": 8192,
-        "model_name": "cogvlm2",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat",
-            "vision"
-        ],
-        "model_description": "CogVLM2 have achieved good results in many lists compared to the previous generation of CogVLM open source models. Its excellent performance can compete with some non-open source models.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 20,
-                "quantizations": [
-                    "none"
-                ],
-                "model_id": "AI-Research/cogvlm2-llama3-chinese-chat-19b",
-                "model_hub": "openmind_hub"
-            }
-        ],
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = '<|begin_of_text|>' + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ '<|end_of_text|>' }}{% endif %}",
-        "stop_token_ids": [
-            128001,
-            128009
-        ],
-        "stop": [
-            "<|end_of_text|>",
-            "<|eot_id|>"
-        ]
-    },
     {
         "version": 1,
         "context_length": 8192,
@@ -14,6 +14,7 @@
 import importlib.util
 import json
 import logging
+import multiprocessing
 import sys
 import threading
 import time
@@ -107,7 +108,11 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-r1-distill-qwen",
     "deepseek-r1-distill-llama",
     "deepseek-v3",
+    "deepseek-v3-0324",
     "deepseek-r1",
+    "deepseek-r1-0528",
+    "deepseek-r1-0528-qwen3",
+    "deepseek-prover-v2",
     "DianJin-R1",
     "qwen3",
     "HuatuoGPT-o1-Qwen2.5",
@@ -184,6 +189,9 @@ class SGLANGModel(LLM):
         if sgl_port is None:
             raise ValueError("Failed to find a port for sglang")
 
+        # fork may cause sglang stuck, force set to spawn
+        multiprocessing.set_start_method("spawn")
+
         if self._n_worker > 1:
             # distributed inference
             self._model_config["nnodes"] = self._n_worker
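The hunk above forces Python's multiprocessing start method to "spawn" before sglang launches its server processes. A minimal standalone sketch of the same pattern is below; the worker function and the force=True flag are illustrative only, not taken from xinference:

import multiprocessing


def _square(x):
    return x * x


if __name__ == "__main__":
    # "spawn" starts each child in a fresh interpreter instead of forking the
    # parent, which avoids the hangs fork can cause in threaded/CUDA runtimes.
    multiprocessing.set_start_method("spawn", force=True)
    with multiprocessing.Pool(processes=2) as pool:
        print(pool.map(_square, [1, 2, 3]))  # [1, 4, 9]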
@@ -16,12 +16,33 @@
 import importlib
 import os
 import pkgutil
+from typing import Dict
 
-
+
+def import_submodules(package_path: str, package_name: str, globals_dict: Dict) -> None:
+    """
+    Recursively import all classes in submodules and subpackages
+    """
+    for _, module_name, is_pkg in pkgutil.iter_modules([package_path]):
+        full_module_name = f"{package_name}.{module_name}"
+
+        if module_name.startswith(
+            ("_", "test_")
+        ):  # Skip the modules which start with "_" or "test_"
+            continue
+
+        module = importlib.import_module(full_module_name)
+        globals_dict[module_name] = module
+
+        # If it's a pkg, recursive processing
+        if is_pkg:
+            subpackage_path = os.path.join(package_path, module_name)
+            import_submodules(subpackage_path, full_module_name, globals_dict)
+
+
+# Get the path and name of the current package
 __path__ = [os.path.dirname(os.path.abspath(__file__))]
+__package__ = __name__
 
-#
-
-    if not module_name.startswith("_"):  # Skip modules starting with underscore
-        module = importlib.import_module(f"{__name__}.{module_name}")
-        globals()[module_name] = module
+# Automatic import of all sub-modules and sub-packages
+import_submodules(__path__[0], __package__, globals())
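For reference, pkgutil.iter_modules is what drives the new recursive importer above; a small illustration of what it yields for a package directory (the path below is hypothetical):

import pkgutil

# Yields (finder, module_name, is_pkg) for each entry directly under the path;
# is_pkg is True for sub-packages, which import_submodules then recurses into.
for _finder, module_name, is_pkg in pkgutil.iter_modules(["/tmp/example_pkg"]):
    print(module_name, is_pkg)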
@@ -22,17 +22,19 @@ import torch
 
 from ....core.scheduler import InferenceRequest
 from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     GLM4_TOOL_CALL_FAMILY,
     generate_chat_completion,
     generate_completion_chunk,
 )
-from .core import PytorchChatModel, PytorchModelConfig
+from .core import PytorchChatModel, PytorchModelConfig, register_non_default_model
 
 logger = logging.getLogger(__name__)
 
 
+@register_transformer
+@register_non_default_model("glm4-chat", "glm4-chat-1m")
 class ChatglmPytorchChatModel(PytorchChatModel):
     def __init__(
         self,
@@ -16,7 +16,7 @@ import json
 import logging
 import os
 from functools import lru_cache
-from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
 
 import torch
 
@@ -45,36 +45,17 @@ from ..utils import (
     QWEN_TOOL_CALL_FAMILY,
     ChatModelMixin,
 )
-from .utils import
+from .utils import (
+    _get_pad_param,
+    get_context_length,
+    get_max_src_len,
+    pad_prefill_tokens,
+)
 
 logger = logging.getLogger(__name__)
 
-
-
-    "glm4-chat",
-    "glm4-chat-1m",
-    "qwen-vl-chat",
-    "OmniLMM",
-    "deepseek-vl-chat",
-    "cogvlm2",
-    "cogvlm2-video-llama3-chat",
-    "MiniCPM-Llama3-V-2_5",
-    "MiniCPM-V-2.6",
-    "glm-4v",
-    "qwen2-audio",
-    "qwen2-audio-instruct",
-    "deepseek-v2",
-    "deepseek-v2-chat",
-    "deepseek-v2.5",
-    "deepseek-v2-chat-0628",
-    "glm-edge-v",
-    "QvQ-72B-Preview",
-    "cogagent",
-    "gemma-3-1b-it",
-    "gemma-3-it",
-    "Ovis2",
-    "deepseek-vl2",
-]
+# !!!!! Do not add model_name to this list, use `register_non_default_model` below instead!
+NON_DEFAULT_MODEL_LIST: List[str] = []
 
 
 # Define the decorator to support multiple names registration
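The hard-coded non-default model list is replaced by decorator-based registration (register_non_default_model, used together with register_transformer in the chatglm, deepseek-v2 and gemma3 hunks of this diff). A minimal sketch of how such a multi-name registration decorator can work; this is an illustration, not xinference's actual implementation:

from typing import List, Type

NON_DEFAULT_MODEL_LIST: List[str] = []


def register_non_default_model(*model_names: str):
    # Class decorator: record every given model name, then return the class unchanged.
    def wrapper(cls: Type) -> Type:
        for name in model_names:
            if name not in NON_DEFAULT_MODEL_LIST:
                NON_DEFAULT_MODEL_LIST.append(name)
        return cls

    return wrapper


@register_non_default_model("glm4-chat", "glm4-chat-1m")
class ExampleChatModel:  # placeholder class, for illustration only
    pass


print(NON_DEFAULT_MODEL_LIST)  # ['glm4-chat', 'glm4-chat-1m']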
@@ -551,6 +532,36 @@ class PytorchModel(LLM):
     def prepare_sanitize_generate_config(self, req: InferenceRequest):
         return self._sanitize_generate_config(req.generate_config)
 
+    def merge_kv_cache(self, past_cache, new_cache):
+        from torch.nn.functional import pad
+        from transformers import DynamicCache
+
+        _, seq_len_idx = self.get_batch_size_and_seq_len_indexes_from_kv()
+        past_seq_len = past_cache[0][0].shape[seq_len_idx]
+        new_seq_len = new_cache[0][0].shape[seq_len_idx]
+        if past_seq_len != new_seq_len:
+            padding_target = new_cache if past_seq_len > new_seq_len else past_cache
+            padding_len = abs(past_seq_len - new_seq_len)
+            pad_param = _get_pad_param(seq_len_idx, padding_len)
+            for idx in range(len(padding_target)):
+                k = padding_target.key_cache[idx]
+                v = padding_target.value_cache[idx]
+                _k = pad(k, pad_param)
+                _v = pad(v, pad_param)
+                padding_target.key_cache[idx] = _k
+                padding_target.value_cache[idx] = _v
+
+        ret_kv = DynamicCache()
+        for idx in range(len(past_cache)):
+            k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
+            v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
+            ret_kv.update(
+                torch.cat((k1, k2), 0).contiguous(),
+                torch.cat((v1, v2), 0).contiguous(),
+                idx,
+            )
+        return ret_kv
+
     def prepare_batch_inference(self, req_list: List[InferenceRequest]):
         # check some parameters
         for r in req_list:
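merge_kv_cache pads the shorter cache along the sequence dimension and then concatenates both caches along the batch dimension. The padding step in isolation, with toy tensors (left-padding is assumed here for illustration; in the real code the pad layout comes from `_get_pad_param`):

import torch
from torch.nn.functional import pad

# Toy key tensors shaped (batch, heads, seq_len, head_dim).
k_short = torch.zeros(1, 8, 5, 64)
k_long = torch.zeros(1, 8, 9, 64)

# F.pad pads dimensions from the last one backwards: (0, 0) leaves head_dim
# untouched, (4, 0) left-pads the seq_len dimension by 4 positions.
pad_param = (0, 0, 4, 0)
k_short_padded = pad(k_short, pad_param)
assert k_short_padded.shape == k_long.shape

# With equal seq_len, the two caches can be stacked along the batch dimension.
merged = torch.cat((k_long, k_short_padded), dim=0).contiguous()
print(merged.shape)  # torch.Size([2, 8, 9, 64])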
@@ -642,6 +653,16 @@ class PytorchModel(LLM):
         )
         self.handle_batch_inference_results(req_list)
 
+    def build_reduced_kv_cache(self, cache, skipped_indexes: Set[int]):
+        batch_size = cache.key_cache[0].shape[0]
+        batch_slices = [num for num in range(batch_size) if num not in skipped_indexes]
+        for idx in range(len(cache)):
+            cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
+            cache.value_cache[idx] = cache.value_cache[idx][
+                batch_slices, ::
+            ].contiguous()
+        return cache
+
 
 class PytorchChatModel(PytorchModel, ChatModelMixin):
     def __init__(
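build_reduced_kv_cache drops the batch rows belonging to requests that have finished. The slicing trick on its own, with a toy tensor (the index set is made up for the example):

import torch

skipped_indexes = {1}  # pretend request 1 has finished and should be dropped
k = torch.randn(3, 8, 4, 64)  # (batch, heads, seq_len, head_dim)

batch_slices = [i for i in range(k.shape[0]) if i not in skipped_indexes]
k_reduced = k[batch_slices, ::].contiguous()  # keep only the live requests
print(k_reduced.shape)  # torch.Size([2, 8, 4, 64])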
@@ -15,59 +15,16 @@ import logging
 
 import torch
 
-from ..llm_family import LLMFamilyV1, LLMSpecV1
-from .core import PytorchChatModel,
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from .core import PytorchChatModel, register_non_default_model
 
 logger = logging.getLogger(__name__)
 
 
-
-
-
-
-                AutoModelForCausalLM,
-                AutoTokenizer,
-                GenerationConfig,
-            )
-        except ImportError:
-            error_message = "Failed to import module 'transformers'"
-            installation_guide = [
-                "Please make sure 'transformers' is installed. ",
-                "You can install it by `pip install transformers`\n",
-            ]
-
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=kwargs["trust_remote_code"],
-        )
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_path,
-            attn_implementation="eager",
-            torch_dtype=torch.bfloat16,
-            trust_remote_code=True,
-            device_map="auto",
-            **kwargs,
-        )
-        model.generation_config = GenerationConfig.from_pretrained(self.model_path)
-        model.generation_config.pad_token_id = model.generation_config.eos_token_id
-        return model, tokenizer
-
-    @classmethod
-    def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
-    ) -> bool:
-        if llm_spec.model_format != "pytorch":
-            return False
-        model_family = llm_family.model_family or llm_family.model_name
-        if "deepseek-v2" not in model_family:
-            return False
-        if "generate" not in llm_family.model_ability:
-            return False
-        return True
-
-
+@register_transformer
+@register_non_default_model(
+    "deepseek-v2-chat", "deepseek-v2.5", "deepseek-v2-chat-0628"
+)
 class DeepSeekV2PytorchChatModel(PytorchChatModel):
     def _load_model(self, **kwargs):
         try:
@@ -11,29 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
 import logging
-import
-
-from
-
-from
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionChunk,
-    PytorchModelConfig,
-)
-from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import generate_chat_completion, generate_completion_chunk
-from .core import PytorchChatModel, PytorchGenerateConfig
-from .utils import cache_clean
+from typing import Dict, List, Set
+
+from ....core.scheduler import InferenceRequest
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from .core import PytorchChatModel, register_non_default_model
 
 logger = logging.getLogger(__name__)
 
 
+@register_transformer
+@register_non_default_model("gemma-3-1b-it")
 class Gemma3TextChatModel(PytorchChatModel):
     @classmethod
     def match_json(
@@ -46,163 +35,129 @@ class Gemma3TextChatModel(PytorchChatModel):
             return True
         return False
 
+    def _load_model(self, **kwargs):
+        import torch
+        from transformers import AutoModelForCausalLM, AutoTokenizer
 
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._tokenizer = None
-        self._model = None
-        self._device = None
-        self._processor = None
-
-    @classmethod
-    def match_json(
-        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
-    ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
-            return False
-        llm_family = model_family.model_family or model_family.model_name
-        if "gemma-3-it".lower() in llm_family.lower():
-            return True
-        return False
-
-    def _sanitize_model_config(
-        self, pytorch_model_config: Optional[PytorchModelConfig]
-    ) -> PytorchModelConfig:
-        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
-        assert pytorch_model_config is not None
-        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
-        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
-        return pytorch_model_config
-
-    def load(self):
-        from transformers import AutoProcessor, Gemma3ForConditionalGeneration
-
-        device = self._pytorch_model_config.get("device", "auto")
-        device = select_device(device)
-        self._device = device
-        # for multiple GPU, set back to auto to make multiple devices work
-        device = "auto" if device == "cuda" else device
-        min_pixels = self._pytorch_model_config.get("min_pixels")
-        max_pixels = self._pytorch_model_config.get("max_pixels")
-        kwargs = self.apply_bnb_quantization()
-        self._processor = AutoProcessor.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
-
-
+            trust_remote_code=kwargs["trust_remote_code"],
+            revision=kwargs["revision"],
         )
-
-
-            self.model_path,
-
-
-    @cache_clean
-    def chat(
-        self,
-        messages: List[ChatCompletionMessage],  # type: ignore
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        messages = self._transform_messages(messages)
-
-        generate_config = generate_config if generate_config else {}
-
-        stream = generate_config.get("stream", False) if generate_config else False
-
-        if stream:
-            it = self._generate_stream(messages, generate_config)
-            return self._to_chat_completion_chunks(it)
-        else:
-            c = self._generate(messages, generate_config)
-            return c
-
-    def _generate(
-        self, messages: List, config: PytorchGenerateConfig = {}
-    ) -> ChatCompletion:
-        inputs = self._processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        ).to(self._device)
-        input_len = inputs["input_ids"].shape[-1]
-
-        generation = self._model.generate(
-            **inputs,
-            do_sample=False,
-            max_new_tokens=config.get("max_tokens", 512),
-            temperature=config.get("temperature", 1),
+        kwargs["torch_dtype"] = torch.bfloat16
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
+            **kwargs,
         )
-
-
-        decoded = self._processor.decode(generation, skip_special_tokens=True)
-        return generate_chat_completion(self.model_uid, decoded)
+        self._device = model.device
+        return model, tokenizer
 
-    def
-
-    ) -> Iterator[CompletionChunk]:
-        from threading import Thread
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
+        return messages
 
-
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        """
+        Note that it is important to prepare `past_key_values` for gemma3 prefill phase
+        """
+        from transformers import HybridCache
 
-        inputs = self.
-
-            add_generation_prompt=True,
+        inputs = self._tokenizer.apply_chat_template(
+            prompts,
             tokenize=True,
-
+            add_generation_prompt=True,
             return_tensors="pt",
+            return_dict=True,
+            padding=True,
         ).to(self._device)
 
-
-
-
+        for i, r in enumerate(req_list):
+            r.prompt_tokens = inputs["input_ids"][i].tolist()
+
+        batch_size = len(prompts)
+        max_cache_len = self.get_context_len()
+        kv = HybridCache(
+            self._model.config,
+            max_batch_size=batch_size,
+            max_cache_len=max_cache_len,
+            dtype=self._model.dtype,
+            device=self._device,
+        )
+        return {**inputs, "past_key_values": kv}
+
+    def merge_kv_cache(self, past_cache, new_cache):
+        """
+        Note that: DO NOT use the `update` func of `HybridCache`, that is unrelated to KV cache merging.
+        """
+        import torch
+        from transformers import HybridCache
+
+        max_cache_len = new_cache.max_cache_len
+        batch_size = past_cache.max_batch_size + new_cache.max_batch_size
+
+        kv_batch = HybridCache(
+            self._model.config,
+            max_batch_size=batch_size,
+            max_cache_len=max_cache_len,
+            dtype=self._model.dtype,
+            device=self._device,
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            has_content=False,
+        new_ks = [
+            torch.cat([nk, pk], dim=0).contiguous()
+            for nk, pk in zip(new_cache.key_cache, past_cache.key_cache)
+        ]
+        new_vs = [
+            torch.cat([nv, pv], dim=0).contiguous()
+            for nv, pv in zip(new_cache.value_cache, past_cache.value_cache)
+        ]
+
+        kv_batch.key_cache.clear()
+        kv_batch.value_cache.clear()
+        kv_batch.key_cache.extend(new_ks)
+        kv_batch.value_cache.extend(new_vs)
+
+        return kv_batch
+
+    def build_decode_attention_mask(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        In Gemma3's inference script, attention_mask is handled internally for decode phase.
+        """
+        return None
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        In Gemma3's inference script, position_ids is handled internally for decode phase.
+        """
+        return None
+
+    def build_reduced_kv_cache(self, cache, skipped_indexes: Set[int]):
+        from transformers import HybridCache
+
+        batch_slices = [
+            num for num in range(cache.max_batch_size) if num not in skipped_indexes
+        ]
+        batch_size = len(batch_slices)
+
+        kv_batch = HybridCache(
+            self._model.config,
+            max_batch_size=batch_size,
+            max_cache_len=cache.max_cache_len,
+            dtype=self._model.dtype,
+            device=self._device,
         )
+
+        ks = cache.key_cache
+        vs = cache.value_cache
+
+        new_ks = [_k[batch_slices, ::].contiguous() for _k in ks]
+        new_vs = [_v[batch_slices, ::].contiguous() for _v in vs]
+        kv_batch.key_cache.clear()
+        kv_batch.value_cache.clear()
+        kv_batch.key_cache.extend(new_ks)
+        kv_batch.value_cache.extend(new_vs)
+
+        return kv_batch
@@ -0,0 +1,13 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.