xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +107 -11
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/constants.py +5 -1
  5. xinference/core/media_interface.py +758 -0
  6. xinference/core/model.py +49 -9
  7. xinference/core/supervisor.py +1 -1
  8. xinference/core/utils.py +1 -1
  9. xinference/core/worker.py +33 -39
  10. xinference/deploy/cmdline.py +17 -0
  11. xinference/deploy/utils.py +0 -3
  12. xinference/model/audio/__init__.py +16 -27
  13. xinference/model/audio/core.py +2 -1
  14. xinference/model/audio/cosyvoice.py +4 -2
  15. xinference/model/audio/model_spec.json +63 -46
  16. xinference/model/audio/model_spec_modelscope.json +31 -14
  17. xinference/model/embedding/__init__.py +16 -24
  18. xinference/model/image/__init__.py +15 -25
  19. xinference/model/llm/__init__.py +40 -115
  20. xinference/model/llm/core.py +29 -6
  21. xinference/model/llm/llama_cpp/core.py +30 -347
  22. xinference/model/llm/llm_family.json +1674 -2203
  23. xinference/model/llm/llm_family.py +71 -7
  24. xinference/model/llm/llm_family_csghub.json +0 -32
  25. xinference/model/llm/llm_family_modelscope.json +1838 -2016
  26. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  27. xinference/model/llm/lmdeploy/core.py +7 -2
  28. xinference/model/llm/mlx/core.py +23 -7
  29. xinference/model/llm/reasoning_parser.py +281 -5
  30. xinference/model/llm/sglang/core.py +39 -11
  31. xinference/model/llm/transformers/chatglm.py +9 -2
  32. xinference/model/llm/transformers/cogagent.py +10 -12
  33. xinference/model/llm/transformers/cogvlm2.py +6 -3
  34. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  35. xinference/model/llm/transformers/core.py +58 -60
  36. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  37. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  38. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  39. xinference/model/llm/transformers/gemma3.py +4 -5
  40. xinference/model/llm/transformers/glm4v.py +3 -21
  41. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  42. xinference/model/llm/transformers/intern_vl.py +3 -6
  43. xinference/model/llm/transformers/internlm2.py +1 -1
  44. xinference/model/llm/transformers/minicpmv25.py +4 -2
  45. xinference/model/llm/transformers/minicpmv26.py +5 -3
  46. xinference/model/llm/transformers/omnilmm.py +1 -1
  47. xinference/model/llm/transformers/opt.py +1 -1
  48. xinference/model/llm/transformers/ovis2.py +302 -0
  49. xinference/model/llm/transformers/qwen-omni.py +8 -1
  50. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  51. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  52. xinference/model/llm/transformers/qwen_vl.py +5 -2
  53. xinference/model/llm/utils.py +96 -45
  54. xinference/model/llm/vllm/core.py +108 -24
  55. xinference/model/llm/vllm/distributed_executor.py +8 -7
  56. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  57. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  58. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  59. xinference/model/llm/vllm/xavier/executor.py +1 -1
  60. xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
  61. xinference/model/rerank/__init__.py +13 -24
  62. xinference/model/video/__init__.py +15 -25
  63. xinference/model/video/core.py +3 -3
  64. xinference/model/video/diffusers.py +157 -13
  65. xinference/model/video/model_spec.json +100 -0
  66. xinference/model/video/model_spec_modelscope.json +104 -0
  67. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  68. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  69. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  70. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  71. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  74. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  75. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  76. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  77. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  78. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  79. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  80. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  81. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  84. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  85. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  86. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  87. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  88. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  89. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  90. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  91. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  92. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  93. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  94. xinference/types.py +2 -71
  95. xinference/web/ui/build/asset-manifest.json +6 -6
  96. xinference/web/ui/build/index.html +1 -1
  97. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  98. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  99. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  100. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  101. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  102. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  103. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  109. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  112. xinference/web/ui/src/locales/en.json +7 -4
  113. xinference/web/ui/src/locales/zh.json +7 -4
  114. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  115. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
  116. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  117. xinference/core/image_interface.py +0 -377
  118. xinference/model/llm/transformers/compression.py +0 -258
  119. xinference/model/llm/transformers/yi_vl.py +0 -239
  120. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  121. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  122. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  123. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  124. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  133. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  134. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  135. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  136. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  137. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/cogvlm2_video.py

@@ -63,7 +63,7 @@ class CogVLM2VideoModel(PytorchChatModel):
         self._model = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -71,7 +71,7 @@ class CogVLM2VideoModel(PytorchChatModel):
             return True
         return False

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from transformers.generation import GenerationConfig

@@ -87,10 +87,7 @@ class CogVLM2VideoModel(PytorchChatModel):
             self._model, self._tokenizer = self._load_tensorizer()
             return

-        if "8-bit" in self.quantization.lower():
-            kwargs["load_in_8bit"] = True
-        elif "4-bit" in self.quantization.lower():
-            kwargs["load_in_4bit"] = True
+        kwargs = self.apply_bnb_quantization()

         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/transformers/core.py

@@ -11,12 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import importlib.util
 import json
 import logging
 import os
 from functools import lru_cache
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union

 import torch

@@ -53,11 +53,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "opt",
     "glm4-chat",
     "glm4-chat-1m",
-    "internlm2-chat",
-    "internlm2.5-chat",
     "qwen-vl-chat",
     "OmniLMM",
-    "yi-vl-chat",
     "deepseek-vl-chat",
     "cogvlm2",
     "cogvlm2-video-llama3-chat",
@@ -75,6 +72,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "cogagent",
     "gemma-3-1b-it",
     "gemma-3-it",
+    "Ovis2",
     "deepseek-vl2",
 ]

@@ -142,6 +140,7 @@ class PytorchModel(LLM):
         pytorch_model_config.setdefault("max_num_seqs", 16)
         pytorch_model_config.setdefault("enable_tensorizer", False)
         pytorch_model_config.setdefault("reasoning_content", False)
+        pytorch_model_config.setdefault("quantization_config", {})
         return pytorch_model_config

     def _sanitize_generate_config(
@@ -264,16 +263,39 @@ class PytorchModel(LLM):
             f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
         )

-    def load(self):
-        try:
-            import torch
-        except ImportError:
-            raise ImportError(
-                f"Failed to import module 'torch'. Please make sure 'torch' is installed.\n\n"
+    def apply_bnb_quantization(
+        self, kwargs: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        model_format = self.model_spec.model_format
+        _kwargs = kwargs if kwargs is not None else {}
+        if model_format == "pytorch":
+            quantization_config = self._pytorch_model_config.get(
+                "quantization_config", {}
             )
-        from .compression import load_compress_model
+            if quantization_config:
+                # If `load_in_4bit` is enabled, apply default quantization presets.
+                if quantization_config.get("load_in_4bit", False):
+                    quantization_config.setdefault(
+                        "bnb_4bit_compute_dtype", torch.float16
+                    )
+                    quantization_config.setdefault("bnb_4bit_use_double_quant", True)
+                    quantization_config.setdefault(
+                        "llm_int8_skip_modules",
+                        [
+                            "lm_head",
+                            "encoder",
+                            "EncDecAttention",
+                        ],
+                    )

-        quantization = self.quantization
+                from transformers import BitsAndBytesConfig
+
+                _kwargs["quantization_config"] = BitsAndBytesConfig(
+                    **quantization_config
+                )
+        return _kwargs
+
+    def load(self):
         num_gpus = gpu_count()
         device = self._pytorch_model_config.get("device", "auto")
         self._pytorch_model_config["device"] = select_device(device)
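Note: the new apply_bnb_quantization helper above replaces the old per-flag load_in_8bit / load_in_4bit handling. It reads a quantization_config dict from pytorch_model_config (defaulted to {} earlier in this file) and wraps it in transformers.BitsAndBytesConfig. A minimal standalone sketch of the equivalent call, with a placeholder model path and config values mirroring the defaults above (illustration only, not the xinference code path):

    # Illustration only: equivalent bitsandbytes wiring outside xinference.
    # "my-model-path" is a placeholder; the config keys mirror the defaults
    # that apply_bnb_quantization fills in above.
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    quantization_config = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
        "bnb_4bit_use_double_quant": True,
        "llm_int8_skip_modules": ["lm_head", "encoder", "EncDecAttention"],
    }
    model = AutoModelForCausalLM.from_pretrained(
        "my-model-path",
        device_map="auto",
        quantization_config=BitsAndBytesConfig(**quantization_config),
    )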
@@ -294,7 +316,6 @@ class PytorchModel(LLM):
         kwargs["trust_remote_code"] = self._pytorch_model_config.get(
             "trust_remote_code"
         )
-        model_format = self.model_spec.model_format

         is_device_map_auto = False

@@ -310,52 +331,18 @@ class PytorchModel(LLM):
             }
             kwargs["max_memory"] = max_memory

-        if quantization != "none" and model_format == "pytorch":
-            if self._device == "cuda" and self._is_linux():
-                kwargs["device_map"] = "auto"
-                is_device_map_auto = True
-                if quantization == "4-bit":
-                    kwargs["load_in_4bit"] = True
-                    kwargs["bnb_4bit_compute_dtype"] = torch.float16
-                    kwargs["bnb_4bit_use_double_quant"] = True
-                    kwargs["llm_int8_skip_modules"] = [
-                        "lm_head",
-                        "encoder",
-                        "EncDecAttention",
-                    ]
-                elif quantization == "8-bit":
-                    kwargs["load_in_8bit"] = True
-                else:
-                    raise ValueError(
-                        f"Quantization {quantization} is not supported in temporary"
-                    )
-            else:
-                if num_gpus != 1 and self._device == "cuda":
-                    raise ValueError(f"Quantization is not supported for multi-gpu")
-                elif quantization != "8-bit":
-                    raise ValueError(
-                        f"Only 8-bit quantization is supported if it is not linux system or cuda device"
-                    )
-                else:
-                    (
-                        self._model,
-                        self._tokenizer,
-                    ) = load_compress_model(
-                        model_path=self.model_path,
-                        device=self._device,
-                        torch_dtype=kwargs["torch_dtype"],
-                        use_fast=self._use_fast_tokenizer,
-                        revision=kwargs["revision"],
-                    )
-                    logger.debug(f"Model Memory: {self._model.get_memory_footprint()}")
-                    return
+        # handle bnb quantization
+        kwargs = self.apply_bnb_quantization(kwargs)

         if num_gpus > 0 and is_hf_accelerate_supported(self._device):
             kwargs.update({"device_map": "auto"})
             is_device_map_auto = True

         reasoning_content = self._pytorch_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._pytorch_model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
@@ -372,7 +359,11 @@ class PytorchModel(LLM):
         logger.debug(f"Model Memory: {self._model.get_memory_footprint()}")

     @classmethod
-    def match(
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("transformers") is not None
+
+    @classmethod
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -689,7 +680,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         return generate_config

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -711,9 +702,14 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def load(self):
         super().load()

-    def _get_full_prompt(self, messages: List[Dict], tools):
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = {}
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
         if (
             tools
             and model_family in QWEN_TOOL_CALL_FAMILY
@@ -736,7 +732,9 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             try:
                 if not r.stopped and r.is_prefill:
                     tools = r.generate_config.get("tools", None)
-                    r.full_prompt = self._get_full_prompt(r.prompt, tools)
+                    r.full_prompt = self._get_full_prompt(
+                        r.prompt, tools, r.generate_config
+                    )
                     if tools:
                         r.tools = tools
             except Exception as e:
@@ -761,7 +759,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         results = []
         for i, c in enumerate(req.completion):
             if c == "<bos_stream>":
-                results.append(
+                results.extend(
                     self._get_first_chat_completion_chunk(
                         req.completion[i + 1], self.reasoning_parser
                     )
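Note: _get_full_prompt now receives the request's generate_config so that chat-template options (for example an enable_thinking toggle for reasoning models) can reach the tokenizer. The body of _get_chat_template_kwargs_from_generate_config is not shown in this diff; the sketch below only illustrates the general idea, and its helper name and config key are assumptions:

    # Hedged sketch: pulling template kwargs out of a generate_config and
    # forwarding them to apply_chat_template. The extraction helper and the
    # "chat_template_kwargs" key are assumed for illustration; only
    # apply_chat_template itself is a real transformers API.
    def get_template_kwargs(generate_config: dict) -> dict:
        kwargs = generate_config.get("chat_template_kwargs") or {}
        return kwargs if isinstance(kwargs, dict) else {}

    full_context_kwargs = get_template_kwargs(
        {"chat_template_kwargs": {"enable_thinking": False}}
    )
    # prompt = tokenizer.apply_chat_template(
    #     messages, tokenize=False, add_generation_prompt=True, **full_context_kwargs
    # )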
xinference/model/llm/transformers/deepseek_v2.py

@@ -48,13 +48,14 @@ class DeepSeekV2PytorchModel(PytorchModel):
             torch_dtype=torch.bfloat16,
             trust_remote_code=True,
             device_map="auto",
+            **kwargs,
         )
         model.generation_config = GenerationConfig.from_pretrained(self.model_path)
         model.generation_config.pad_token_id = model.generation_config.eos_token_id
         return model, tokenizer

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
@@ -95,13 +96,14 @@ class DeepSeekV2PytorchChatModel(PytorchChatModel):
             torch_dtype=torch.bfloat16,
             trust_remote_code=True,
             device_map="auto",
+            **kwargs,
         )
         model.generation_config = GenerationConfig.from_pretrained(self.model_path)
         model.generation_config.pad_token_id = model.generation_config.eos_token_id
         return model, tokenizer

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
xinference/model/llm/transformers/deepseek_vl.py

@@ -42,11 +42,11 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
-        if "deepseek-vl" == llm_family.lower():
+        if "deepseek-vl-chat" == llm_family.lower():
             return True
         return False

@@ -62,6 +62,8 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._device = select_device(self._device)
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16

+        kwargs = self.apply_bnb_quantization()
+
         # specify the path to the model
         self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
@@ -69,9 +71,13 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._tokenizer = self._vl_chat_processor.tokenizer

         vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
-            self.model_path, trust_remote_code=True, device_map=self._device
+            self.model_path,
+            trust_remote_code=True,
+            device_map=self._device,
+            torch_dtype=self._type,
+            **kwargs,
         )
-        self._model = vl_gpt.to(self._type).eval()
+        self._model = vl_gpt.eval()

     @staticmethod
     def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
xinference/model/llm/transformers/deepseek_vl2.py

@@ -42,7 +42,7 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
         self._type = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
@@ -60,7 +60,8 @@ class DeepSeekVL2ChatModel(PytorchChatModel):

         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
-        self._type = torch.float16 if self._device == "mps" else torch.bfloat16
+        self._type = torch.bfloat16
+        kwargs = self.apply_bnb_quantization()

         # specify the path to the model
         self._vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(  # type: ignore
@@ -69,9 +70,13 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
         self._tokenizer = self._vl_chat_processor.tokenizer

         vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
-            self.model_path, trust_remote_code=True, device_map=self._device
+            self.model_path,
+            trust_remote_code=True,
+            device_map=self._device,
+            torch_dtype=self._type,
+            **kwargs,
         )
-        self._model = vl_gpt.to(torch.bfloat16).cuda().eval()
+        self._model = vl_gpt.cuda().eval()

     @staticmethod
     def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
xinference/model/llm/transformers/gemma3.py

@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)

 class Gemma3TextChatModel(PytorchChatModel):
     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -56,7 +56,7 @@ class Gemma3ChatModel(PytorchChatModel):
         self._processor = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -85,6 +85,7 @@ class Gemma3ChatModel(PytorchChatModel):
         device = "auto" if device == "cuda" else device
         min_pixels = self._pytorch_model_config.get("min_pixels")
         max_pixels = self._pytorch_model_config.get("max_pixels")
+        kwargs = self.apply_bnb_quantization()
         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
             min_pixels=min_pixels,
@@ -92,9 +93,7 @@ class Gemma3ChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         self._model = Gemma3ForConditionalGeneration.from_pretrained(
-            self.model_path,
-            device_map="auto",
-            torch_dtype="bfloat16",
+            self.model_path, device_map="auto", torch_dtype="bfloat16", **kwargs
         )

     @cache_clean
xinference/model/llm/transformers/glm4v.py

@@ -39,7 +39,7 @@ class Glm4VModel(PytorchChatModel):
         self._model = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -54,25 +54,7 @@ class Glm4VModel(PytorchChatModel):
         self._device = select_device(device)

         kwargs = {"device_map": self._device}
-        quantization = self.quantization
-
-        # referenced from PytorchModel.load
-        if quantization != "none":
-            if self._device == "cuda" and self._is_linux():
-                kwargs["device_map"] = "auto"
-                if quantization == "4-bit":
-                    kwargs["load_in_4bit"] = True
-                elif quantization == "8-bit":
-                    kwargs["load_in_8bit"] = True
-                else:
-                    raise ValueError(
-                        f"Quantization {quantization} is not supported in temporary"
-                    )
-            else:
-                if quantization != "8-bit":
-                    raise ValueError(
-                        f"Only 8-bit quantization is supported if it is not linux system or cuda device"
-                    )
+        kwargs = self.apply_bnb_quantization(kwargs)

         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer()
@@ -214,7 +196,7 @@ class Glm4VModel(PytorchChatModel):
             has_content=False,
         )

-    def _get_full_prompt(self, messages, tools):
+    def _get_full_prompt(self, messages, tools, generate_config: dict):
         msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,
xinference/model/llm/transformers/glm_edge_v.py

@@ -42,7 +42,7 @@ class GlmEdgeVModel(PytorchChatModel):
         self._processor = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -57,25 +57,7 @@ class GlmEdgeVModel(PytorchChatModel):
         self._device = select_device(device)

         kwargs = {"device_map": self._device}
-        quantization = self.quantization
-
-        # referenced from PytorchModel.load
-        if quantization != "none":
-            if self._device == "cuda" and self._is_linux():
-                kwargs["device_map"] = "auto"
-                if quantization == "4-bit":
-                    kwargs["load_in_4bit"] = True
-                elif quantization == "8-bit":
-                    kwargs["load_in_8bit"] = True
-                else:
-                    raise ValueError(
-                        f"Quantization {quantization} is not supported in temporary"
-                    )
-            else:
-                if quantization != "8-bit":
-                    raise ValueError(
-                        f"Only 8-bit quantization is supported if it is not linux system or cuda device"
-                    )
+        kwargs = self.apply_bnb_quantization(kwargs)

         processor = AutoImageProcessor.from_pretrained(
             self.model_path, trust_remote_code=True
@@ -87,6 +69,7 @@ class GlmEdgeVModel(PytorchChatModel):
             trust_remote_code=True,
             torch_dtype=torch.bfloat16,
             device_map="auto",
+            **kwargs
         )

         self._model = model
xinference/model/llm/transformers/intern_vl.py

@@ -243,7 +243,7 @@ class InternVLChatModel(PytorchChatModel):
         self._model = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -311,7 +311,7 @@ class InternVLChatModel(PytorchChatModel):
             device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
         return device_map

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModel, AutoTokenizer

         if self._check_tensorizer_integrity():
@@ -329,10 +329,7 @@ class InternVLChatModel(PytorchChatModel):
         if device is not None:
             kwargs["device_map"] = device

-        if "8-bit" in self.quantization.lower():
-            kwargs["load_in_8bit"] = True
-        elif "4-bit" in self.quantization.lower():
-            kwargs["load_in_4bit"] = True
+        kwargs = self.apply_bnb_quantization(kwargs)

         self._model = AutoModel.from_pretrained(self.model_path, **kwargs).eval()

xinference/model/llm/transformers/internlm2.py

@@ -71,7 +71,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         return model, tokenizer

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         model_family = llm_family.model_family or llm_family.model_name
xinference/model/llm/transformers/minicpmv25.py

@@ -42,7 +42,7 @@ class MiniCPMV25Model(PytorchChatModel):
         self._model = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -55,7 +55,7 @@ class MiniCPMV25Model(PytorchChatModel):

         return AutoModel

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModel, AutoTokenizer
         from transformers.generation import GenerationConfig

@@ -76,11 +76,13 @@ class MiniCPMV25Model(PytorchChatModel):
         if "int4" in self.model_path:
             model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
         else:
+            kwargs = self.apply_bnb_quantization()
             model = AutoModel.from_pretrained(
                 self.model_path,
                 trust_remote_code=True,
                 torch_dtype=torch.float16,
                 device_map=self._device,
+                **kwargs
             )
         tokenizer = AutoTokenizer.from_pretrained(
             self.model_path, trust_remote_code=True
xinference/model/llm/transformers/minicpmv26.py

@@ -49,7 +49,7 @@ class MiniCPMV26Model(PytorchChatModel):
         self._processor = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -71,7 +71,7 @@ class MiniCPMV26Model(PytorchChatModel):

         return AutoModel

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModel, AutoProcessor, AutoTokenizer
         from transformers.generation import GenerationConfig

@@ -96,11 +96,13 @@ class MiniCPMV26Model(PytorchChatModel):
         if "int4" in self.model_path:
             model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
         else:
+            kwargs = self.apply_bnb_quantization()
             model = AutoModel.from_pretrained(
                 self.model_path,
                 trust_remote_code=True,
                 torch_dtype=torch.float16,
                 device_map=self._device,
+                **kwargs,
             )
         tokenizer = AutoTokenizer.from_pretrained(
             self.model_path, trust_remote_code=True
@@ -322,7 +324,7 @@ class MiniCPMV26Model(PytorchChatModel):
             "input_image": images,
         }

-    def _get_full_prompt(self, messages: List[Dict], tools):
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
         msgs, video_existed = self._convert_to_specific_style(messages)
         if video_existed:
             raise RuntimeError(
xinference/model/llm/transformers/omnilmm.py

@@ -35,7 +35,7 @@ class OmniLMMModel(PytorchChatModel):
         self._model = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
xinference/model/llm/transformers/opt.py

@@ -42,7 +42,7 @@ class OptPytorchModel(PytorchModel):
         )

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":