xinference 1.5.0.post2__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (89)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +10 -3
  3. xinference/constants.py +5 -1
  4. xinference/core/supervisor.py +1 -1
  5. xinference/core/utils.py +1 -1
  6. xinference/core/worker.py +2 -2
  7. xinference/deploy/cmdline.py +17 -0
  8. xinference/model/audio/core.py +1 -1
  9. xinference/model/audio/model_spec.json +43 -43
  10. xinference/model/audio/model_spec_modelscope.json +13 -13
  11. xinference/model/llm/__init__.py +3 -5
  12. xinference/model/llm/core.py +14 -0
  13. xinference/model/llm/llama_cpp/core.py +15 -4
  14. xinference/model/llm/llm_family.json +3251 -4304
  15. xinference/model/llm/llm_family.py +62 -6
  16. xinference/model/llm/llm_family_csghub.json +0 -32
  17. xinference/model/llm/llm_family_modelscope.json +1161 -1789
  18. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  19. xinference/model/llm/lmdeploy/core.py +7 -2
  20. xinference/model/llm/mlx/core.py +19 -6
  21. xinference/model/llm/sglang/core.py +25 -10
  22. xinference/model/llm/transformers/chatglm.py +8 -1
  23. xinference/model/llm/transformers/cogagent.py +10 -12
  24. xinference/model/llm/transformers/cogvlm2.py +6 -3
  25. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  26. xinference/model/llm/transformers/core.py +50 -58
  27. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  28. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  29. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  30. xinference/model/llm/transformers/gemma3.py +4 -5
  31. xinference/model/llm/transformers/glm4v.py +2 -20
  32. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  33. xinference/model/llm/transformers/intern_vl.py +3 -6
  34. xinference/model/llm/transformers/internlm2.py +1 -1
  35. xinference/model/llm/transformers/minicpmv25.py +4 -2
  36. xinference/model/llm/transformers/minicpmv26.py +5 -3
  37. xinference/model/llm/transformers/omnilmm.py +1 -1
  38. xinference/model/llm/transformers/opt.py +1 -1
  39. xinference/model/llm/transformers/ovis2.py +302 -0
  40. xinference/model/llm/transformers/qwen-omni.py +2 -1
  41. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  42. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  43. xinference/model/llm/transformers/qwen_vl.py +5 -2
  44. xinference/model/llm/utils.py +28 -0
  45. xinference/model/llm/vllm/core.py +73 -9
  46. xinference/model/llm/vllm/distributed_executor.py +8 -7
  47. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  48. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  49. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  50. xinference/model/llm/vllm/xavier/executor.py +1 -1
  51. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -1
  52. xinference/model/video/diffusers.py +30 -3
  53. xinference/model/video/model_spec.json +46 -0
  54. xinference/model/video/model_spec_modelscope.json +48 -0
  55. xinference/types.py +2 -0
  56. xinference/web/ui/build/asset-manifest.json +6 -6
  57. xinference/web/ui/build/index.html +1 -1
  58. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  59. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  60. xinference/web/ui/build/static/js/main.91e77b5c.js +3 -0
  61. xinference/web/ui/build/static/js/main.91e77b5c.js.map +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  69. xinference/web/ui/src/locales/en.json +1 -0
  70. xinference/web/ui/src/locales/zh.json +1 -0
  71. {xinference-1.5.0.post2.dist-info → xinference-1.5.1.dist-info}/METADATA +1 -1
  72. {xinference-1.5.0.post2.dist-info → xinference-1.5.1.dist-info}/RECORD +77 -78
  73. {xinference-1.5.0.post2.dist-info → xinference-1.5.1.dist-info}/WHEEL +1 -1
  74. xinference/model/llm/transformers/compression.py +0 -258
  75. xinference/model/llm/transformers/yi_vl.py +0 -239
  76. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  77. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  78. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  86. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.91e77b5c.js.LICENSE.txt} +0 -0
  87. {xinference-1.5.0.post2.dist-info → xinference-1.5.1.dist-info}/entry_points.txt +0 -0
  88. {xinference-1.5.0.post2.dist-info → xinference-1.5.1.dist-info}/licenses/LICENSE +0 -0
  89. {xinference-1.5.0.post2.dist-info → xinference-1.5.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/core.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import importlib.util
 import logging
 import platform
 import sys
@@ -172,7 +172,11 @@ class MLXModel(LLM):
         self._model, self._tokenizer = self._load_model(**kwargs)

     @classmethod
-    def match(
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("mlx_lm") is not None
+
+    @classmethod
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
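This release splits the old `match` classmethod into two checks: `check_lib`, which only asks whether the backend's runtime library is importable, and `match_json`, which decides whether a given family/spec/quantization fits the backend. A minimal sketch of the pattern, using a hypothetical backend class that is not part of this diff:

    import importlib.util

    class ExampleBackend:
        @classmethod
        def check_lib(cls) -> bool:
            # True only if the backend's library is installed.
            return importlib.util.find_spec("mlx_lm") is not None

        @classmethod
        def match_json(cls, llm_family, llm_spec, quantization: str) -> bool:
            # Spec-level compatibility check; no heavy imports needed here.
            return llm_spec.model_format == "mlx"

Presumably this lets "engine installed?" and "model supported?" be answered independently; the exact dispatch logic lives in llm_family.py (+62 -6 in this release).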
@@ -423,7 +427,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         return generate_config

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
@@ -445,7 +449,9 @@ class MLXChatModel(MLXModel, ChatModelMixin):
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
-        full_context_kwargs = {}
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+        )
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
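Several chat backends in this release now pull extra chat-template arguments out of `generate_config` via `_get_chat_template_kwargs_from_generate_config` and forward them into `get_full_context`. A hedged sketch of what a caller might send; the `enable_thinking` key is purely illustrative, not something this diff defines:

    # `model` stands for an already-obtained chat model handle.
    completion = model.chat(
        messages=[{"role": "user", "content": "Hello"}],
        generate_config={
            "max_tokens": 256,
            # Forwarded into the chat template rendering as keyword arguments.
            "chat_template_kwargs": {"enable_thinking": False},  # illustrative key
        },
    )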
@@ -476,7 +482,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):

 class MLXVisionModel(MLXModel, ChatModelMixin):
     @classmethod
-    def match(
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("mlx_vlm") is not None
+
+    @classmethod
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
@@ -623,7 +633,10 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         if "internvl2" not in model_family.lower():
             from qwen_vl_utils import process_vision_info

-            full_context_kwargs = {}
+            full_context_kwargs = (
+                self._get_chat_template_kwargs_from_generate_config(generate_config)  # type: ignore
+                or {}
+            )
             if tools and model_family in QWEN_TOOL_CALL_FAMILY:
                 full_context_kwargs["tools"] = tools
             assert self.model_family.chat_template is not None
xinference/model/llm/sglang/core.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import importlib.util
 import json
 import logging
 import sys
@@ -107,6 +107,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-r1-distill-llama",
     "deepseek-v3",
     "deepseek-r1",
+    "qwen3",
 ]
 SGLANG_SUPPORTED_VISION_MODEL_LIST = [
     "qwen2.5-vl-instruct",
@@ -297,7 +298,11 @@ class SGLANGModel(LLM):
         return generate_config

     @classmethod
-    def match(
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("sglang") is not None
+
+    @classmethod
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
@@ -435,6 +440,7 @@
     async def async_generate(
         self,
         prompt: str,
+        *,
         image_data: Optional[Union[List[str], str]] = None,
         generate_config: Optional[SGLANGGenerateConfig] = None,
         request_id: Optional[str] = None,
@@ -524,7 +530,7 @@

 class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
@@ -551,6 +557,7 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if self.model_family.stop:
             if (not generate_config.get("stop")) and self.model_family.stop:
                 generate_config["stop"] = self.model_family.stop.copy()
+        generate_config.pop("chat_template_kwargs", None)
         return generate_config

     async def async_chat(
@@ -560,23 +567,28 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(messages, self.model_family.chat_template)
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+        )
+        full_prompt = self.get_full_context(
+            messages, self.model_family.chat_template, **full_context_kwargs
+        )

         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)  # type: ignore
+            agen = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
-            c = await self.async_generate(full_prompt, generate_config)  # type: ignore
+            c = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
             return self._to_chat_completion(c, self.reasoning_parser)


 class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
@@ -627,7 +639,10 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
             self.model_family.chat_template if self.model_family.chat_template else ""
         )

-        prompt = self.get_full_context(messages, chat_template)
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+        )
+        prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
         images, video_inputs = process_vision_info(messages)
         if video_inputs:
             raise ValueError("Not support video input now.")
@@ -650,10 +665,10 @@

         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
         if stream:
-            agen = await self.async_generate(prompt, base64_images, generate_config)  # type: ignore
+            agen = await self.async_generate(prompt, image_data=base64_images, generate_config=generate_config)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
-            c = await self.async_generate(prompt, base64_images, generate_config)  # type: ignore
+            c = await self.async_generate(prompt, image_data=base64_images, generate_config=generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
             return self._to_chat_completion(c, self.reasoning_parser)
xinference/model/llm/transformers/chatglm.py

@@ -84,7 +84,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         return model, tokenizer

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
@@ -462,6 +462,12 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         tools = list(tools) if tools is not None else None
         tool_choice = r.generate_config.get("tool_choice", "none")

+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                r.generate_config
+            )
+            or {}
+        )
         r.prompt = self._process_messages(
             r.prompt, tools=tools, tool_choice=tool_choice
         )
@@ -469,6 +475,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             r.prompt,
             self.model_family.chat_template,  # type: ignore
             tokenizer=self._tokenizer,
+            **full_context_kwargs,
         )
         if tools:
             r.tools = tools
xinference/model/llm/transformers/cogagent.py

@@ -46,8 +46,8 @@ class CogAgentChatModel(PytorchChatModel):
         self._device = None
         self._tokenizer = None
         self._model = None
-        self._platform: Literal["Mac", "WIN", "Mobile"] | None = "Mac"
-        self._format: Literal[
+        self._platform: Literal["Mac", "WIN", "Mobile"] | None = "Mac"  # type: ignore
+        self._format: Literal[  # type: ignore
             "(Answer in Action-Operation-Sensitive format.)",
             "(Answer in Status-Plan-Action-Operation format.)",
             "(Answer in Status-Action-Operation-Sensitive format.)",
@@ -56,7 +56,7 @@ class CogAgentChatModel(PytorchChatModel):
         ] | None = "(Answer in Action-Operation-Sensitive format.)"

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -64,8 +64,8 @@ class CogAgentChatModel(PytorchChatModel):
             return True
         return False

-    def load(self, **kwargs):
-        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+    def load(self):
+        from transformers import AutoModelForCausalLM, AutoTokenizer

         device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(device)
@@ -73,19 +73,14 @@ class CogAgentChatModel(PytorchChatModel):
         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path, trust_remote_code=True
         )
-        if self.quantization == "4-bit":
-            quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-        elif self.quantization == "8-bit":
-            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-        else:
-            quantization_config = None
+        kwargs = self.apply_bnb_quantization()

         self._model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             torch_dtype=torch.bfloat16,
             trust_remote_code=True,
             device_map=self._device,
-            quantization_config=quantization_config,
+            **kwargs,
         ).eval()

     def _message_content_to_cogagent(self, content):
@@ -211,6 +206,9 @@ class CogAgentChatModel(PytorchChatModel):
             "return_tensors": "pt",
             "return_dict": True,
         }
+        full_context_kwargs.update(
+            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
+        )
         assert self.model_family.chat_template is not None
         inputs = self.get_full_context(
             [{"role": "user", "image": image, "content": query}],
xinference/model/llm/transformers/cogvlm2.py

@@ -64,7 +64,7 @@ class CogVLM2Model(PytorchChatModel):
         self._model = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -72,7 +72,7 @@ class CogVLM2Model(PytorchChatModel):
             return True
         return False

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from transformers.generation import GenerationConfig

@@ -88,6 +88,8 @@ class CogVLM2Model(PytorchChatModel):
             self._model, self._tokenizer = self._load_tensorizer()
             return

+        kwargs = self.apply_bnb_quantization()
+
         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
             trust_remote_code=True,
@@ -99,6 +101,7 @@ class CogVLM2Model(PytorchChatModel):
             trust_remote_code=True,
             low_cpu_mem_usage=True,
             device_map="auto",
+            **kwargs
         ).eval()

         # Specify hyperparameters for generation
@@ -313,7 +316,7 @@ class CogVLM2Model(PytorchChatModel):
     def get_dtype(self):
         return self._torch_type

-    def _get_full_prompt(self, messages: List[Dict], tools):
+    def _get_full_prompt(self, messages: List[Dict], tools):  # type: ignore
         prompt, system_prompt, chat_history = parse_messages(messages)
         system_prompt = system_prompt or ""
         query, image, history = self.get_query_and_history(
xinference/model/llm/transformers/cogvlm2_video.py

@@ -63,7 +63,7 @@ class CogVLM2VideoModel(PytorchChatModel):
         self._model = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
@@ -71,7 +71,7 @@ class CogVLM2VideoModel(PytorchChatModel):
             return True
         return False

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from transformers.generation import GenerationConfig

@@ -87,10 +87,7 @@ class CogVLM2VideoModel(PytorchChatModel):
             self._model, self._tokenizer = self._load_tensorizer()
             return

-        if "8-bit" in self.quantization.lower():
-            kwargs["load_in_8bit"] = True
-        elif "4-bit" in self.quantization.lower():
-            kwargs["load_in_4bit"] = True
+        kwargs = self.apply_bnb_quantization()

         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/transformers/core.py

@@ -11,12 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import importlib.util
 import json
 import logging
 import os
 from functools import lru_cache
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union

 import torch

@@ -53,11 +53,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "opt",
     "glm4-chat",
     "glm4-chat-1m",
-    "internlm2-chat",
-    "internlm2.5-chat",
     "qwen-vl-chat",
     "OmniLMM",
-    "yi-vl-chat",
     "deepseek-vl-chat",
     "cogvlm2",
     "cogvlm2-video-llama3-chat",
@@ -75,6 +72,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "cogagent",
     "gemma-3-1b-it",
     "gemma-3-it",
+    "Ovis2",
     "deepseek-vl2",
 ]

@@ -142,6 +140,7 @@ class PytorchModel(LLM):
         pytorch_model_config.setdefault("max_num_seqs", 16)
         pytorch_model_config.setdefault("enable_tensorizer", False)
         pytorch_model_config.setdefault("reasoning_content", False)
+        pytorch_model_config.setdefault("quantization_config", {})
         return pytorch_model_config

     def _sanitize_generate_config(
@@ -264,16 +263,39 @@
             f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
         )

-    def load(self):
-        try:
-            import torch
-        except ImportError:
-            raise ImportError(
-                f"Failed to import module 'torch'. Please make sure 'torch' is installed.\n\n"
+    def apply_bnb_quantization(
+        self, kwargs: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        model_format = self.model_spec.model_format
+        _kwargs = kwargs if kwargs is not None else {}
+        if model_format == "pytorch":
+            quantization_config = self._pytorch_model_config.get(
+                "quantization_config", {}
             )
-        from .compression import load_compress_model
+            if quantization_config:
+                # If `load_in_4bit` is enabled, apply default quantization presets.
+                if quantization_config.get("load_in_4bit", False):
+                    quantization_config.setdefault(
+                        "bnb_4bit_compute_dtype", torch.float16
+                    )
+                    quantization_config.setdefault("bnb_4bit_use_double_quant", True)
+                    quantization_config.setdefault(
+                        "llm_int8_skip_modules",
+                        [
+                            "lm_head",
+                            "encoder",
+                            "EncDecAttention",
+                        ],
+                    )
+
+                from transformers import BitsAndBytesConfig
+
+                _kwargs["quantization_config"] = BitsAndBytesConfig(
+                    **quantization_config
+                )
+        return _kwargs

-        quantization = self.quantization
+    def load(self):
         num_gpus = gpu_count()
         device = self._pytorch_model_config.get("device", "auto")
         self._pytorch_model_config["device"] = select_device(device)
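The hard-coded `4-bit`/`8-bit` branches (and the removed compression.py fallback) are replaced by `apply_bnb_quantization`, which reads a `quantization_config` dict from `pytorch_model_config` and hands it to `transformers.BitsAndBytesConfig`. A hedged sketch of supplying it at launch time, assuming extra engine kwargs are forwarded into `pytorch_model_config`; the endpoint and model arguments are placeholders:

    from xinference.client import Client

    client = Client("http://localhost:9997")  # placeholder endpoint
    client.launch_model(
        model_name="qwen2.5-instruct",        # placeholder model
        model_engine="transformers",
        model_format="pytorch",
        model_size_in_billions=7,
        # Forwarded to BitsAndBytesConfig(**quantization_config); with
        # load_in_4bit=True the defaults above (fp16 compute dtype, double
        # quantization, llm_int8_skip_modules) are filled in automatically.
        quantization_config={"load_in_4bit": True},
    )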
@@ -294,7 +316,6 @@
         kwargs["trust_remote_code"] = self._pytorch_model_config.get(
             "trust_remote_code"
         )
-        model_format = self.model_spec.model_format

         is_device_map_auto = False

@@ -310,45 +331,8 @@
             }
             kwargs["max_memory"] = max_memory

-        if quantization != "none" and model_format == "pytorch":
-            if self._device == "cuda" and self._is_linux():
-                kwargs["device_map"] = "auto"
-                is_device_map_auto = True
-                if quantization == "4-bit":
-                    kwargs["load_in_4bit"] = True
-                    kwargs["bnb_4bit_compute_dtype"] = torch.float16
-                    kwargs["bnb_4bit_use_double_quant"] = True
-                    kwargs["llm_int8_skip_modules"] = [
-                        "lm_head",
-                        "encoder",
-                        "EncDecAttention",
-                    ]
-                elif quantization == "8-bit":
-                    kwargs["load_in_8bit"] = True
-                else:
-                    raise ValueError(
-                        f"Quantization {quantization} is not supported in temporary"
-                    )
-            else:
-                if num_gpus != 1 and self._device == "cuda":
-                    raise ValueError(f"Quantization is not supported for multi-gpu")
-                elif quantization != "8-bit":
-                    raise ValueError(
-                        f"Only 8-bit quantization is supported if it is not linux system or cuda device"
-                    )
-                else:
-                    (
-                        self._model,
-                        self._tokenizer,
-                    ) = load_compress_model(
-                        model_path=self.model_path,
-                        device=self._device,
-                        torch_dtype=kwargs["torch_dtype"],
-                        use_fast=self._use_fast_tokenizer,
-                        revision=kwargs["revision"],
-                    )
-                    logger.debug(f"Model Memory: {self._model.get_memory_footprint()}")
-                    return
+        # handle bnb quantization
+        kwargs = self.apply_bnb_quantization(kwargs)

         if num_gpus > 0 and is_hf_accelerate_supported(self._device):
             kwargs.update({"device_map": "auto"})
@@ -372,7 +356,11 @@
         logger.debug(f"Model Memory: {self._model.get_memory_footprint()}")

     @classmethod
-    def match(
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("transformers") is not None
+
+    @classmethod
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -689,7 +677,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         return generate_config

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -711,9 +699,11 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def load(self):
         super().load()

-    def _get_full_prompt(self, messages: List[Dict], tools):
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = {}
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+        )
         if (
             tools
             and model_family in QWEN_TOOL_CALL_FAMILY
@@ -736,7 +726,9 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             try:
                 if not r.stopped and r.is_prefill:
                     tools = r.generate_config.get("tools", None)
-                    r.full_prompt = self._get_full_prompt(r.prompt, tools)
+                    r.full_prompt = self._get_full_prompt(
+                        r.prompt, tools, r.generate_config
+                    )
                     if tools:
                         r.tools = tools
             except Exception as e:
xinference/model/llm/transformers/deepseek_v2.py

@@ -48,13 +48,14 @@ class DeepSeekV2PytorchModel(PytorchModel):
             torch_dtype=torch.bfloat16,
             trust_remote_code=True,
             device_map="auto",
+            **kwargs,
         )
         model.generation_config = GenerationConfig.from_pretrained(self.model_path)
         model.generation_config.pad_token_id = model.generation_config.eos_token_id
         return model, tokenizer

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
@@ -95,13 +96,14 @@ class DeepSeekV2PytorchChatModel(PytorchChatModel):
             torch_dtype=torch.bfloat16,
             trust_remote_code=True,
             device_map="auto",
+            **kwargs,
         )
         model.generation_config = GenerationConfig.from_pretrained(self.model_path)
         model.generation_config.pad_token_id = model.generation_config.eos_token_id
         return model, tokenizer

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
xinference/model/llm/transformers/deepseek_vl.py

@@ -42,11 +42,11 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
-        if "deepseek-vl" == llm_family.lower():
+        if "deepseek-vl-chat" == llm_family.lower():
             return True
         return False

@@ -62,6 +62,8 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._device = select_device(self._device)
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16

+        kwargs = self.apply_bnb_quantization()
+
         # specify the path to the model
         self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
@@ -69,9 +71,13 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._tokenizer = self._vl_chat_processor.tokenizer

         vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
-            self.model_path, trust_remote_code=True, device_map=self._device
+            self.model_path,
+            trust_remote_code=True,
+            device_map=self._device,
+            torch_dtype=self._type,
+            **kwargs,
         )
-        self._model = vl_gpt.to(self._type).eval()
+        self._model = vl_gpt.eval()

     @staticmethod
     def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
xinference/model/llm/transformers/deepseek_vl2.py

@@ -42,7 +42,7 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
         self._type = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
@@ -60,7 +60,8 @@ class DeepSeekVL2ChatModel(PytorchChatModel):

         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
-        self._type = torch.float16 if self._device == "mps" else torch.bfloat16
+        self._type = torch.bfloat16
+        kwargs = self.apply_bnb_quantization()

         # specify the path to the model
         self._vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(  # type: ignore
@@ -69,9 +70,13 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
         self._tokenizer = self._vl_chat_processor.tokenizer

         vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
-            self.model_path, trust_remote_code=True, device_map=self._device
+            self.model_path,
+            trust_remote_code=True,
+            device_map=self._device,
+            torch_dtype=self._type,
+            **kwargs,
         )
-        self._model = vl_gpt.to(torch.bfloat16).cuda().eval()
+        self._model = vl_gpt.cuda().eval()

     @staticmethod
     def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
xinference/model/llm/transformers/gemma3.py

@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)

 class Gemma3TextChatModel(PytorchChatModel):
     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -56,7 +56,7 @@ class Gemma3ChatModel(PytorchChatModel):
         self._processor = None

     @classmethod
-    def match(
+    def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
@@ -85,6 +85,7 @@ class Gemma3ChatModel(PytorchChatModel):
         device = "auto" if device == "cuda" else device
         min_pixels = self._pytorch_model_config.get("min_pixels")
         max_pixels = self._pytorch_model_config.get("max_pixels")
+        kwargs = self.apply_bnb_quantization()
         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
             min_pixels=min_pixels,
@@ -92,9 +93,7 @@ class Gemma3ChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         self._model = Gemma3ForConditionalGeneration.from_pretrained(
-            self.model_path,
-            device_map="auto",
-            torch_dtype="bfloat16",
+            self.model_path, device_map="auto", torch_dtype="bfloat16", **kwargs
         )

     @cache_clean