xinference 1.6.1__py3-none-any.whl → 1.7.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +64 -2
  4. xinference/core/media_interface.py +123 -0
  5. xinference/core/model.py +31 -0
  6. xinference/core/supervisor.py +8 -17
  7. xinference/core/worker.py +5 -17
  8. xinference/deploy/cmdline.py +6 -2
  9. xinference/model/audio/chattts.py +24 -39
  10. xinference/model/audio/cosyvoice.py +18 -30
  11. xinference/model/audio/funasr.py +42 -0
  12. xinference/model/audio/model_spec.json +18 -0
  13. xinference/model/audio/model_spec_modelscope.json +19 -1
  14. xinference/model/audio/utils.py +75 -0
  15. xinference/model/core.py +1 -0
  16. xinference/model/embedding/__init__.py +74 -18
  17. xinference/model/embedding/core.py +98 -597
  18. xinference/model/embedding/embed_family.py +133 -0
  19. xinference/model/embedding/flag/__init__.py +13 -0
  20. xinference/model/embedding/flag/core.py +282 -0
  21. xinference/model/embedding/model_spec.json +24 -0
  22. xinference/model/embedding/model_spec_modelscope.json +24 -0
  23. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  24. xinference/model/embedding/sentence_transformers/core.py +399 -0
  25. xinference/model/embedding/vllm/__init__.py +0 -0
  26. xinference/model/embedding/vllm/core.py +95 -0
  27. xinference/model/image/model_spec.json +20 -2
  28. xinference/model/image/model_spec_modelscope.json +21 -2
  29. xinference/model/image/stable_diffusion/core.py +144 -53
  30. xinference/model/llm/llama_cpp/memory.py +4 -2
  31. xinference/model/llm/llm_family.json +57 -0
  32. xinference/model/llm/llm_family_modelscope.json +61 -0
  33. xinference/model/llm/sglang/core.py +4 -0
  34. xinference/model/llm/utils.py +11 -0
  35. xinference/model/llm/vllm/core.py +3 -0
  36. xinference/model/rerank/core.py +96 -4
  37. xinference/model/rerank/model_spec.json +24 -0
  38. xinference/model/rerank/model_spec_modelscope.json +24 -0
  39. xinference/model/rerank/utils.py +4 -3
  40. xinference/model/utils.py +38 -1
  41. xinference/model/video/diffusers.py +65 -3
  42. xinference/model/video/model_spec.json +31 -4
  43. xinference/model/video/model_spec_modelscope.json +32 -4
  44. xinference/web/ui/build/asset-manifest.json +6 -6
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  47. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  49. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  56. xinference/web/ui/src/locales/en.json +18 -7
  57. xinference/web/ui/src/locales/ja.json +224 -0
  58. xinference/web/ui/src/locales/ko.json +224 -0
  59. xinference/web/ui/src/locales/zh.json +18 -7
  60. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/METADATA +9 -8
  61. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/RECORD +66 -57
  62. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  63. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  64. xinference/web/ui/build/static/js/main.ddf9eaee.js +0 -3
  65. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.ddf9eaee.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  73. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/WHEEL +0 -0
  74. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/entry_points.txt +0 -0
  75. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/licenses/LICENSE +0 -0
  76. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/top_level.txt +0 -0
@@ -22,6 +22,7 @@ import logging
 import os
 import re
 import sys
+import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
@@ -197,8 +198,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         return getattr(module, class_name)
 
     def load(self):
-        from transformers import BitsAndBytesConfig, T5EncoderModel
-
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
@@ -227,58 +226,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._get_controlnet_model(*cn) for cn in controlnet
             ]
 
+        # quantizations
+        # text_encoder
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder and not self._gguf_model_path:
-            try:
-                import bitsandbytes  # noqa: F401
-            except ImportError:
-                error_message = "Failed to import module 'bitsandbytes'"
-                installation_guide = [
-                    "Please make sure 'bitsandbytes' is installed. ",
-                    "You can install it by `pip install bitsandbytes`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-            for text_encoder_name in quantize_text_encoder.split(","):
-                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-                quantization_kwargs = {}
-                if torch_dtype:
-                    quantization_kwargs["torch_dtype"] = torch_dtype
-                text_encoder = T5EncoderModel.from_pretrained(
-                    self._model_path,
-                    subfolder=text_encoder_name,
-                    quantization_config=quantization_config,
-                    **quantization_kwargs,
-                )
-                self._kwargs[text_encoder_name] = text_encoder
-                self._kwargs["device_map"] = "balanced"
-
+        self._quantize_text_encoder(quantize_text_encoder)
+        # transformer
         if self._gguf_model_path:
-            from diffusers import GGUFQuantizationConfig
-
-            # GGUF transformer
-            self._kwargs["transformer"] = self._get_layer_cls(
-                "transformer"
-            ).from_single_file(
-                self._gguf_model_path,
-                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
-                torch_dtype=torch_dtype,
-                config=os.path.join(self._model_path, "transformer"),
-            )
-        elif self._kwargs.get("transformer_nf4"):
-            nf4_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch_dtype,
-            )
-            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
-                self._model_path,
-                subfolder="transformer",
-                quantization_config=nf4_config,
-                torch_dtype=torch_dtype,
-            )
-            self._kwargs["transformer"] = model_nf4
+            self._quantize_transformer_gguf()
+        else:
+            self._quantize_transformer()
 
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
@@ -308,6 +264,133 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 cache_branch_id=self._kwargs.get("deepcache_cache_branch_id", 0),
             )
 
+    def _get_quantize_config(self, method: str, quantization: str, module: str):
+        if method == "bnb":
+            try:
+                import bitsandbytes  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'bitsandbytes'"
+                installation_guide = [
+                    "Please make sure 'bitsandbytes' is installed. ",
+                    "You can install it by `pip install bitsandbytes`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import BitsAndBytesConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import BitsAndBytesConfig
+
+            if quantization == "4-bit":
+                return BitsAndBytesConfig(load_in_4bit=True)
+            elif quantization == "8-bit":
+                return BitsAndBytesConfig(load_in_8bit=True)
+            elif quantization == "nf4":
+                return BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=self._torch_dtype,
+                )
+        elif method == "torchao":
+            try:
+                import torchao  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'torchao'"
+                installation_guide = [
+                    "Please make sure 'torchao' is installed. ",
+                    "You can install it by `pip install torchao`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import TorchAoConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import TorchAoConfig
+
+            return TorchAoConfig(quantization)
+        else:
+            raise ValueError(f"Unknown quantization method for image model: {method}")
+
+    def _quantize_text_encoder(self, quantize_text_encoder: Optional[str]):
+        if self._gguf_model_path:
+            # skip quantization when gguf applied to transformer
+            return
+
+        if not quantize_text_encoder:
+            return
+
+        quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
+        quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
+
+        torch_dtype = self._torch_dtype
+        for text_encoder_name in quantize_text_encoder.split(","):
+            quantization_kwargs: Dict[str, Any] = {}
+            if torch_dtype:
+                quantization_kwargs["torch_dtype"] = torch_dtype
+            text_encoder_cls = self._get_layer_cls(text_encoder_name)
+            quantization_config = self._get_quantize_config(
+                quantization_method, quantization, text_encoder_cls.__module__
+            )
+            text_encoder = text_encoder_cls.from_pretrained(
+                self._model_path,
+                subfolder=text_encoder_name,
+                quantization_config=quantization_config,
+                **quantization_kwargs,
+            )
+            self._kwargs[text_encoder_name] = text_encoder
+        else:
+            if not self._kwargs.get("device_map"):
+                self._kwargs["device_map"] = "balanced"
+
+    def _quantize_transformer(self):
+        quantization = None
+        nf4 = self._kwargs.pop("transformer_nf4", None)
+        if nf4:
+            warnings.warn(
+                "`transformer_nf4` is deprecated, please use `transformer_quantization=nf4`",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            quantization = "nf4"
+        method = self._kwargs.pop("transformer_quantize_method", "bnb")
+        if not quantization:
+            quantization = self._kwargs.pop("transformer_quantization", None)
+
+        if not quantization:
+            # skip if no quantization specified
+            return
+
+        torch_dtype = self._torch_dtype
+        transformer_cls = self._get_layer_cls("transformer")
+        quantization_config = self._get_quantize_config(
+            method, quantization, transformer_cls.__module__
+        )
+        transformer_model = transformer_cls.from_pretrained(
+            self._model_path,
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch_dtype,
+        )
+        self._kwargs["transformer"] = transformer_model
+
+    def _quantize_transformer_gguf(self):
+        from diffusers import GGUFQuantizationConfig
+
+        # GGUF transformer
+        torch_dtype = self._torch_dtype
+        self._kwargs["transformer"] = self._get_layer_cls(
+            "transformer"
+        ).from_single_file(
+            self._gguf_model_path,
+            quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+            torch_dtype=torch_dtype,
+            config=os.path.join(self._model_path, "transformer"),
+        )
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
@@ -321,7 +404,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
-            model.enable_vae_tiling()
+            try:
+                model.enable_vae_tiling()
+            except AttributeError:
+                model.vae.enable_tiling()
+        if self._kwargs.get("vae_slicing", False):
+            try:
+                model.enable_vae_slicing()
+            except AttributeError:
+                model.vae.enable_slicing()
 
     def get_max_num_images_for_batching(self):
         return self._kwargs.get("max_num_images", 16)
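
Note on usage: the refactor above replaces the hard-coded 8-bit T5 text-encoder path and the `transformer_nf4` flag with configurable kwargs (`text_encoder_quantize_method`, `text_encoder_quantization`, `transformer_quantize_method`, `transformer_quantization`, plus the new `vae_slicing` switch). Below is a minimal sketch of how these might be passed when launching an image model; only the kwarg names come from the diff, while the client call, endpoint, and model name are assumptions.

# Sketch only: assumes extra launch kwargs end up in DiffusionModel._kwargs.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
model_uid = client.launch_model(
    model_name="FLUX.1-dev",                 # assumed diffusers-based image model
    model_type="image",
    quantize_text_encoder="text_encoder_2",  # which text encoder(s) to quantize
    text_encoder_quantize_method="bnb",      # "bnb" or "torchao" per _get_quantize_config
    text_encoder_quantization="8-bit",       # "4-bit" / "8-bit" / "nf4" with bnb
    transformer_quantize_method="bnb",
    transformer_quantization="nf4",          # replaces the deprecated transformer_nf4=True
    cpu_offload=True,
    vae_tiling=True,
    vae_slicing=True,                        # new switch added in this release
)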
@@ -17,8 +17,10 @@ from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any
 
-from gguf import GGUFReader, GGUFValueType  # noqa: E402
-
+try:
+    from gguf import GGUFReader, GGUFValueType  # noqa: E402
+except ImportError:
+    GGUFReader = GGUFValueType = None
 logger = logging.getLogger(__name__)
 
 
@@ -6142,6 +6142,53 @@
       "</s>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "minicpm4",
+    "model_lang": [
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, which achieves this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-0.5B"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-8B"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4bit"
+        ],
+        "model_id": "mlx-community/MiniCPM4-8B-4bit"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      73440
+    ],
+    "stop": [
+      "</s>",
+      "<|im_end|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 32768,
@@ -6737,6 +6784,16 @@
           "none"
         ],
         "model_id": "deepseek-ai/DeepSeek-R1-0528"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 671,
+        "quantizations": [
+          "Int4-Int8Mix-Lite",
+          "Int4-Int8Mix-Compact",
+          "Int4-Int8Mix-Medium"
+        ],
+        "model_id": "QuantTrio/DeepSeek-R1-0528-GPTQ-{quantization}"
       }
     ],
  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
@@ -4277,6 +4277,56 @@
       "</s>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "minicpm4",
+    "model_lang": [
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, which achieves this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-0.5B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4bit"
+        ],
+        "model_id": "mlx-community/MiniCPM4-8B-4bit",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      73440
+    ],
+    "stop": [
+      "</s>",
+      "<|im_end|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 32768,
@@ -4883,6 +4933,17 @@
         ],
         "model_id": "deepseek-ai/DeepSeek-R1-0528",
        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 671,
+        "quantizations": [
+          "Int4-Int8Mix-Lite",
+          "Int4-Int8Mix-Compact",
+          "Int4-Int8Mix-Medium"
+        ],
+        "model_id": "tclf90/DeepSeek-R1-0528-GPTQ-{quantization}",
+        "model_hub": "modelscope"
       }
     ],
  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
@@ -14,6 +14,7 @@
 import importlib.util
 import json
 import logging
+import multiprocessing
 import sys
 import threading
 import time
@@ -188,6 +189,9 @@ class SGLANGModel(LLM):
         if sgl_port is None:
             raise ValueError("Failed to find a port for sglang")
 
+        # fork may cause sglang stuck, force set to spawn
+        multiprocessing.set_start_method("spawn")
+
         if self._n_worker > 1:
             # distributed inference
             self._model_config["nnodes"] = self._n_worker
@@ -709,6 +709,12 @@ class ChatModelMixin:
         finish_reason = "tool_calls" if tool_calls else "stop"
 
         content = ". ".join(failed_contents) if failed_contents else None
+
+        # fix: qwen tool_call content field return null
+        family = model_family.model_family or model_family.model_name
+        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
+            content = ""
+
         d = {
             "role": "assistant",
             "content": content,
@@ -779,6 +785,11 @@ class ChatModelMixin:
             failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
+        # fix: qwen tool_call content field return null
+        family = model_family.model_family or model_family.model_name
+        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
+            content = ""
+
         m = {
             "role": "assistant",
             "content": content,
@@ -252,6 +252,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.9.1":
+    VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")
+
 
 class VLLMModel(LLM):
     def __init__(
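
With the JSON specs above and this vLLM gate (vllm >= 0.9.1), the new minicpm4 family can be launched like any other built-in LLM. A hedged sketch follows; the endpoint and prompt are assumptions, while the spec values come from the diff.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
uid = client.launch_model(
    model_name="minicpm4",
    model_type="LLM",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="none",
)
model = client.get_model(uid)
print(model.chat(messages=[{"role": "user", "content": "Hello"}]))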
@@ -14,6 +14,7 @@
 
 import gc
 import importlib
+import importlib.util
 import logging
 import os
 import threading
@@ -31,6 +32,7 @@ from ...device_utils import empty_cache
 from ...types import Document, DocumentObj, Rerank, RerankTokens
 from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import is_model_cached
+from .utils import preprocess_sentence
 
 logger = logging.getLogger(__name__)
 
@@ -201,7 +203,10 @@ class RerankModel:
             )
             self._use_fp16 = True
 
-        if self._model_spec.type == "normal":
+        if (
+            self._model_spec.type == "normal"
+            and "qwen3" not in self._model_spec.model_name.lower()
+        ):
             try:
                 import sentence_transformers
                 from sentence_transformers.cross_encoder import CrossEncoder
@@ -229,6 +234,74 @@ class RerankModel:
                 )
             if self._use_fp16:
                 self._model.model.half()
+        elif "qwen3" in self._model_spec.model_name.lower():
+            # qwen3-reranker
+            # now we use transformers
+            # TODO: support engines for rerank models
+            try:
+                from transformers import AutoModelForCausalLM, AutoTokenizer
+            except ImportError:
+                error_message = "Failed to import module 'transformers'"
+                installation_guide = [
+                    "Please make sure 'transformers' is installed. ",
+                    "You can install it by `pip install transformers`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                self._model_path, padding_side="left"
+            )
+            flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+            model_kwargs = {"device_map": "auto"}
+            if flash_attn_installed:
+                model_kwargs["attn_implementation"] = "flash_attention_2"
+                model_kwargs["torch_dtype"] = torch.float16
+            model = self._model = AutoModelForCausalLM.from_pretrained(
+                self._model_path, **model_kwargs
+            ).eval()
+            max_length = getattr(self._model_spec, "max_tokens")
+
+            prefix = (
+                "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query "
+                'and the Instruct provided. Note that the answer can only be "yes" or "no".'
+                "<|im_end|>\n<|im_start|>user\n"
+            )
+            suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
+            suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
+
+            def process_inputs(pairs):
+                inputs = tokenizer(
+                    pairs,
+                    padding=False,
+                    truncation="longest_first",
+                    return_attention_mask=False,
+                    max_length=max_length - len(prefix_tokens) - len(suffix_tokens),
+                )
+                for i, ele in enumerate(inputs["input_ids"]):
+                    inputs["input_ids"][i] = prefix_tokens + ele + suffix_tokens
+                inputs = tokenizer.pad(
+                    inputs, padding=True, return_tensors="pt", max_length=max_length
+                )
+                for key in inputs:
+                    inputs[key] = inputs[key].to(model.device)
+                return inputs
+
+            token_false_id = tokenizer.convert_tokens_to_ids("no")
+            token_true_id = tokenizer.convert_tokens_to_ids("yes")
+
+            def compute_logits(inputs, **kwargs):
+                batch_scores = model(**inputs).logits[:, -1, :]
+                true_vector = batch_scores[:, token_true_id]
+                false_vector = batch_scores[:, token_false_id]
+                batch_scores = torch.stack([false_vector, true_vector], dim=1)
+                batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+                scores = batch_scores[:, 1].exp().tolist()
+                return scores
+
+            self.process_inputs = process_inputs
+            self.compute_logits = compute_logits
         else:
             try:
                 if self._model_spec.type == "LLM-based":
@@ -266,15 +339,17 @@ class RerankModel:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         logger.info("Rerank with kwargs: %s, model: %s", kwargs, self._model)
 
-        from .utils import preprocess_sentence
-
         pre_query = preprocess_sentence(
             query, kwargs.get("instruction", None), self._model_spec.model_name
         )
         sentence_combinations = [[pre_query, doc] for doc in documents]
         # reset n tokens
         self._model.model.n_tokens = 0
-        if self._model_spec.type == "normal":
+        if (
+            self._model_spec.type == "normal"
+            and "qwen3" not in self._model_spec.model_name.lower()
+        ):
+            logger.debug("Passing processed sentences: %s", sentence_combinations)
             similarity_scores = self._model.predict(
                 sentence_combinations,
                 convert_to_numpy=False,
@@ -283,6 +358,23 @@ class RerankModel:
             ).cpu()
             if similarity_scores.dtype == torch.bfloat16:
                 similarity_scores = similarity_scores.float()
+        elif "qwen3" in self._model_spec.model_name.lower():
+
+            def format_instruction(instruction, query, doc):
+                if instruction is None:
+                    instruction = "Given a web search query, retrieve relevant passages that answer the query"
+                output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+                    instruction=instruction, query=query, doc=doc
+                )
+                return output
+
+            pairs = [
+                format_instruction(kwargs.get("instruction", None), query, doc)
+                for doc in documents
+            ]
+            # Tokenize the input texts
+            inputs = self.process_inputs(pairs)
+            similarity_scores = self.compute_logits(inputs)
         else:
             # Related issue: https://github.com/xorbitsai/inference/issues/1775
             similarity_scores = self._model.compute_score(
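
For the Qwen3 rerankers registered in the specs below, scores come from the "yes"/"no" token logits at the last position rather than a CrossEncoder, and an optional `instruction` kwarg feeds the <Instruct> field. A usage sketch follows; the endpoint, documents, and query are assumptions, and whether extra kwargs such as `instruction` are forwarded through the REST client is also an assumption.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
uid = client.launch_model(model_name="Qwen3-Reranker-0.6B", model_type="rerank")
reranker = client.get_model(uid)
result = reranker.rerank(
    documents=["Paris is the capital of France.", "Berlin is in Germany."],
    query="What is the capital of France?",
)
print(result["results"])  # per-document relevance, P("yes") per compute_logits above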
@@ -62,5 +62,29 @@
     "max_tokens": 1024,
     "model_id": "openbmb/MiniCPM-Reranker",
     "model_revision": "5d2fd7345b6444c89d4c0fa59c92272888f3f2d0"
+  },
+  {
+    "model_name": "Qwen3-Reranker-0.6B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-0.6B",
+    "model_revision": "6e9e69830b95c52b5fd889b7690dda3329508de3"
+  },
+  {
+    "model_name": "Qwen3-Reranker-4B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-4B",
+    "model_revision": "f16fc5d5d2b9b1d0db8280929242745d79794ef5"
+  },
+  {
+    "model_name": "Qwen3-Reranker-8B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-8B",
+    "model_revision": "5fa94080caafeaa45a15d11f969d7978e087a3db"
   }
 ]
@@ -57,5 +57,29 @@
     "max_tokens": 1024,
     "model_id": "OpenBMB/MiniCPM-Reranker",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "Qwen3-Reranker-0.6B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-0.6B",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "Qwen3-Reranker-4B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-4B",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "Qwen3-Reranker-8B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-8B",
+    "model_hub": "modelscope"
   }
 ]