xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +2 -1
  3. xinference/core/model.py +8 -4
  4. xinference/core/supervisor.py +2 -3
  5. xinference/core/worker.py +7 -5
  6. xinference/deploy/cmdline.py +2 -0
  7. xinference/deploy/local.py +5 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/deploy/worker.py +6 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/model_spec.json +44 -20
  12. xinference/model/core.py +3 -0
  13. xinference/model/embedding/flag/core.py +5 -0
  14. xinference/model/embedding/llama_cpp/core.py +22 -19
  15. xinference/model/embedding/sentence_transformers/core.py +18 -4
  16. xinference/model/embedding/vllm/core.py +36 -9
  17. xinference/model/image/cache_manager.py +56 -0
  18. xinference/model/image/core.py +9 -0
  19. xinference/model/image/model_spec.json +178 -1
  20. xinference/model/image/stable_diffusion/core.py +155 -23
  21. xinference/model/llm/cache_manager.py +17 -3
  22. xinference/model/llm/harmony.py +245 -0
  23. xinference/model/llm/llama_cpp/core.py +41 -40
  24. xinference/model/llm/llm_family.json +688 -11
  25. xinference/model/llm/llm_family.py +1 -1
  26. xinference/model/llm/sglang/core.py +108 -5
  27. xinference/model/llm/transformers/core.py +20 -18
  28. xinference/model/llm/transformers/gemma3.py +1 -1
  29. xinference/model/llm/transformers/gpt_oss.py +91 -0
  30. xinference/model/llm/transformers/multimodal/core.py +1 -1
  31. xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
  32. xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
  33. xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
  34. xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
  35. xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
  36. xinference/model/llm/transformers/utils.py +1 -33
  37. xinference/model/llm/utils.py +61 -7
  38. xinference/model/llm/vllm/core.py +44 -8
  39. xinference/model/rerank/__init__.py +66 -23
  40. xinference/model/rerank/cache_manager.py +35 -0
  41. xinference/model/rerank/core.py +87 -339
  42. xinference/model/rerank/custom.py +33 -8
  43. xinference/model/rerank/model_spec.json +251 -212
  44. xinference/model/rerank/rerank_family.py +137 -0
  45. xinference/model/rerank/sentence_transformers/__init__.py +13 -0
  46. xinference/model/rerank/sentence_transformers/core.py +337 -0
  47. xinference/model/rerank/vllm/__init__.py +13 -0
  48. xinference/model/rerank/vllm/core.py +156 -0
  49. xinference/model/utils.py +108 -0
  50. xinference/model/video/model_spec.json +95 -1
  51. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  52. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  53. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  54. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  55. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  56. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  57. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  58. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  59. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  60. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  61. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  62. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  63. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  64. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  65. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  66. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  67. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  69. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  70. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  71. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  72. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  73. xinference/types.py +2 -0
  74. xinference/ui/gradio/chat_interface.py +2 -0
  75. xinference/ui/gradio/media_interface.py +353 -7
  76. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  77. xinference/ui/web/ui/build/index.html +1 -1
  78. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  79. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  80. xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
  81. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  82. xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
  83. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  88. xinference/ui/web/ui/src/locales/en.json +2 -0
  89. xinference/ui/web/ui/src/locales/ja.json +2 -0
  90. xinference/ui/web/ui/src/locales/ko.json +2 -0
  91. xinference/ui/web/ui/src/locales/zh.json +2 -0
  92. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
  93. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
  94. xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
  95. xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
  96. xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
  97. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  98. xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
  99. xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
  100. xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
  101. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  102. xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
  103. xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
  104. /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  105. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
  106. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
  107. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
  108. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
xinference/model/rerank/sentence_transformers/core.py ADDED
@@ -0,0 +1,337 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import gc
+ import importlib.util
+ import logging
+ import threading
+ import uuid
+ from typing import List, Optional, Sequence
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+
+ from ....device_utils import empty_cache
+ from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
+ from ...utils import is_flash_attn_available
+ from ..core import (
+     RERANK_EMPTY_CACHE_COUNT,
+     RerankModel,
+     RerankModelFamilyV2,
+     RerankSpecV1,
+ )
+ from ..utils import preprocess_sentence
+
+ logger = logging.getLogger(__name__)
+
+
+ class _ModelWrapper(nn.Module):
+     def __init__(self, module: nn.Module):
+         super().__init__()
+         self.model = module
+         self._local_data = threading.local()
+
+     @property
+     def n_tokens(self):
+         return getattr(self._local_data, "n_tokens", 0)
+
+     @n_tokens.setter
+     def n_tokens(self, value):
+         self._local_data.n_tokens = value
+
+     def forward(self, **kwargs):
+         attention_mask = kwargs.get("attention_mask")
+         # when batching, the attention mask 1 means there is a token
+         # thus we just sum up it to get the total number of tokens
+         if attention_mask is not None:
+             self.n_tokens += attention_mask.sum().item()
+         return self.model(**kwargs)
+
+     def __getattr__(self, attr):
+         try:
+             return super().__getattr__(attr)
+         except AttributeError:
+             return getattr(self.model, attr)
+
+
+ class SentenceTransformerRerankModel(RerankModel):
+     def load(self):
+         # TODO: Split FlagReranker and sentence_transformers into different model_engines like FlagRerankModel
+         logger.info("Loading rerank model: %s", self._model_path)
+         enable_flash_attn = self._kwargs.pop(
+             "enable_flash_attn", is_flash_attn_available()
+         )
+         if enable_flash_attn:
+             logger.warning(
+                 "flash_attn can only support fp16 and bf16, will force set `use_fp16` to True"
+             )
+             self._use_fp16 = True
+
+         if (
+             self.model_family.type == "normal"
+             and "qwen3" not in self.model_family.model_name.lower()
+         ):
+             try:
+                 import sentence_transformers
+                 from sentence_transformers.cross_encoder import CrossEncoder
+
+                 if sentence_transformers.__version__ < "3.1.0":
+                     raise ValueError(
+                         "The sentence_transformers version must be greater than 3.1.0. "
+                         "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                         "https://github.com/UKPLab/sentence-transformers"
+                     )
+             except ImportError:
+                 error_message = "Failed to import module 'sentence-transformers'"
+                 installation_guide = [
+                     "Please make sure 'sentence-transformers' is installed. ",
+                     "You can install it by `pip install sentence-transformers`\n",
+                 ]
+
+                 raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+             self._model = CrossEncoder(
+                 self._model_path,
+                 device=self._device,
+                 trust_remote_code=True,
+                 max_length=getattr(self.model_family, "max_tokens"),
+                 **self._kwargs,
+             )
+             if self._use_fp16:
+                 self._model.model.half()
+         elif "qwen3" in self.model_family.model_name.lower():
+             # qwen3-reranker
+             # now we use transformers
+             # TODO: support engines for rerank models
+             try:
+                 from transformers import AutoModelForCausalLM, AutoTokenizer
+             except ImportError:
+                 error_message = "Failed to import module 'transformers'"
+                 installation_guide = [
+                     "Please make sure 'transformers' is installed. ",
+                     "You can install it by `pip install transformers`\n",
+                 ]
+
+                 raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+             tokenizer = AutoTokenizer.from_pretrained(
+                 self._model_path, padding_side="left"
+             )
+             model_kwargs = {"device_map": "auto"}
+             if enable_flash_attn:
+                 model_kwargs["attn_implementation"] = "flash_attention_2"
+                 model_kwargs["torch_dtype"] = torch.float16
+             model_kwargs.update(self._kwargs)
+             logger.debug("Loading qwen3 rerank with kwargs %s", model_kwargs)
+             model = self._model = AutoModelForCausalLM.from_pretrained(
+                 self._model_path, **model_kwargs
+             ).eval()
+             max_length = getattr(self.model_family, "max_tokens")
+
+             prefix = (
+                 "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query "
+                 'and the Instruct provided. Note that the answer can only be "yes" or "no".'
+                 "<|im_end|>\n<|im_start|>user\n"
+             )
+             suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+             prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
+             suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
+
+             def process_inputs(pairs):
+                 inputs = tokenizer(
+                     pairs,
+                     padding=False,
+                     truncation="longest_first",
+                     return_attention_mask=False,
+                     max_length=max_length - len(prefix_tokens) - len(suffix_tokens),
+                 )
+                 for i, ele in enumerate(inputs["input_ids"]):
+                     inputs["input_ids"][i] = prefix_tokens + ele + suffix_tokens
+                 inputs = tokenizer.pad(
+                     inputs, padding=True, return_tensors="pt", max_length=max_length
+                 )
+                 for key in inputs:
+                     inputs[key] = inputs[key].to(model.device)
+                 return inputs
+
+             token_false_id = tokenizer.convert_tokens_to_ids("no")
+             token_true_id = tokenizer.convert_tokens_to_ids("yes")
+
+             @torch.inference_mode()
+             def compute_logits(inputs, **kwargs):
+                 batch_scores = model(**inputs).logits[:, -1, :]
+                 true_vector = batch_scores[:, token_true_id]
+                 false_vector = batch_scores[:, token_false_id]
+                 batch_scores = torch.stack([false_vector, true_vector], dim=1)
+                 batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+                 scores = batch_scores[:, 1].exp().tolist()
+                 return scores
+
+             self.process_inputs = process_inputs
+             self.compute_logits = compute_logits
+         else:
+             try:
+                 if self.model_family.type == "LLM-based":
+                     from FlagEmbedding import FlagLLMReranker as FlagReranker
+                 elif self.model_family.type == "LLM-based layerwise":
+                     from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+                 else:
+                     raise RuntimeError(
+                         f"Unsupported Rank model type: {self.model_family.type}"
+                     )
+             except ImportError:
+                 error_message = "Failed to import module 'FlagEmbedding'"
+                 installation_guide = [
+                     "Please make sure 'FlagEmbedding' is installed. ",
+                     "You can install it by `pip install FlagEmbedding`\n",
+                 ]
+
+                 raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+             self._model = FlagReranker(self._model_path, use_fp16=self._use_fp16)
+         # Wrap transformers model to record number of tokens
+         self._model.model = _ModelWrapper(self._model.model)
+
+     def rerank(
+         self,
+         documents: List[str],
+         query: str,
+         top_n: Optional[int],
+         max_chunks_per_doc: Optional[int],
+         return_documents: Optional[bool],
+         return_len: Optional[bool],
+         **kwargs,
+     ) -> Rerank:
+         assert self._model is not None
+         if max_chunks_per_doc is not None:
+             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
+         logger.info("Rerank with kwargs: %s, model: %s", kwargs, self._model)
+
+         pre_query = preprocess_sentence(
+             query, kwargs.get("instruction", None), self.model_family.model_name
+         )
+         sentence_combinations = [[pre_query, doc] for doc in documents]
+         # reset n tokens
+         self._model.model.n_tokens = 0
+         if (
+             self.model_family.type == "normal"
+             and "qwen3" not in self.model_family.model_name.lower()
+         ):
+             logger.debug("Passing processed sentences: %s", sentence_combinations)
+             similarity_scores = self._model.predict(
+                 sentence_combinations,
+                 convert_to_numpy=False,
+                 convert_to_tensor=True,
+                 **kwargs,
+             ).cpu()
+             if similarity_scores.dtype == torch.bfloat16:
+                 similarity_scores = similarity_scores.float()
+         elif "qwen3" in self.model_family.model_name.lower():
+
+             def format_instruction(instruction, query, doc):
+                 if instruction is None:
+                     instruction = "Given a web search query, retrieve relevant passages that answer the query"
+                 output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+                     instruction=instruction, query=query, doc=doc
+                 )
+                 return output
+
+             # reduce memory usage.
+             micro_bs = 4
+             similarity_scores = []
+             for i in range(0, len(documents), micro_bs):
+                 sub_docs = documents[i : i + micro_bs]
+                 pairs = [
+                     format_instruction(kwargs.get("instruction", None), query, doc)
+                     for doc in sub_docs
+                 ]
+                 # Tokenize the input texts
+                 inputs = self.process_inputs(pairs)
+                 similarity_scores.extend(self.compute_logits(inputs))
+         else:
+             # Related issue: https://github.com/xorbitsai/inference/issues/1775
+             similarity_scores = self._model.compute_score(
+                 sentence_combinations, **kwargs
+             )
+
+         if not isinstance(similarity_scores, Sequence):
+             similarity_scores = [similarity_scores]
+         elif (
+             isinstance(similarity_scores, list)
+             and len(similarity_scores) > 0
+             and isinstance(similarity_scores[0], Sequence)
+         ):
+             similarity_scores = similarity_scores[0]
+
+         sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
+         if top_n is not None:
+             sim_scores_argsort = sim_scores_argsort[:top_n]
+         if return_documents:
+             docs = [
+                 DocumentObj(
+                     index=int(arg),
+                     relevance_score=float(similarity_scores[arg]),
+                     document=Document(text=documents[arg]),
+                 )
+                 for arg in sim_scores_argsort
+             ]
+         else:
+             docs = [
+                 DocumentObj(
+                     index=int(arg),
+                     relevance_score=float(similarity_scores[arg]),
+                     document=None,
+                 )
+                 for arg in sim_scores_argsort
+             ]
+         if return_len:
+             input_len = self._model.model.n_tokens
+             # Rerank Model output is just score or documents
+             # while return_documents = True
+             output_len = input_len
+
+         # api_version, billed_units, warnings
+         # is for Cohere API compatibility, set to None
+         metadata = Meta(
+             api_version=None,
+             billed_units=None,
+             tokens=(
+                 RerankTokens(input_tokens=input_len, output_tokens=output_len)
+                 if return_len
+                 else None
+             ),
+             warnings=None,
+         )
+
+         del similarity_scores
+         # clear cache if possible
+         self._counter += 1
+         if self._counter % RERANK_EMPTY_CACHE_COUNT == 0:
+             logger.debug("Empty rerank cache.")
+             gc.collect()
+             empty_cache()
+
+         return Rerank(id=str(uuid.uuid1()), results=docs, meta=metadata)
+
+     @classmethod
+     def check_lib(cls) -> bool:
+         return importlib.util.find_spec("sentence_transformers") is not None
+
+     @classmethod
+     def match_json(
+         cls,
+         model_family: RerankModelFamilyV2,
+         model_spec: RerankSpecV1,
+         quantization: str,
+     ) -> bool:
+         # As default embedding engine, sentence-transformer support all models
+         return model_spec.model_format in ["pytorch"]
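
Note on the qwen3 branch above: it never uses a classification head. The causal LM is asked to answer "yes" or "no", and the two logits at the final position are turned into a relevance score via the softmax probability of "yes". A minimal, self-contained sketch of just that scoring step (torch only; the logits tensor and token ids below are invented for illustration, whereas in the real code they come from AutoModelForCausalLM(**inputs).logits[:, -1, :] and the tokenizer):

    import torch

    # Invented final-position logits for two query/document pairs over a toy
    # 5-token vocabulary; compute_logits above obtains them from the model.
    last_token_logits = torch.tensor(
        [[0.1, 2.5, -1.0, 0.3, 0.0],
         [1.2, -0.4, 0.8, 2.0, -0.5]]
    )
    token_false_id, token_true_id = 2, 1  # toy ids standing in for "no" and "yes"

    # Stack the [no, yes] logits, normalize, and keep P("yes") as the relevance
    # score, mirroring compute_logits in the diff above.
    pair_logits = torch.stack(
        [last_token_logits[:, token_false_id], last_token_logits[:, token_true_id]],
        dim=1,
    )
    scores = torch.nn.functional.log_softmax(pair_logits, dim=1)[:, 1].exp().tolist()
    print(scores)  # roughly [0.97, 0.23]: the first pair is judged far more relevant
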
xinference/model/rerank/vllm/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
xinference/model/rerank/vllm/core.py ADDED
@@ -0,0 +1,156 @@
+ import importlib.util
+ import uuid
+ from typing import List, Optional
+
+ from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
+ from ...utils import cache_clean
+ from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
+
+ SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
+
+
+ class VLLMRerankModel(RerankModel):
+     def load(self):
+         try:
+             from vllm import LLM
+
+         except ImportError:
+             error_message = "Failed to import module 'vllm'"
+             installation_guide = [
+                 "Please make sure 'vllm' is installed. ",
+                 "You can install it by `pip install vllm`\n",
+             ]
+
+             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+         if self.model_family.model_name in {
+             "Qwen3-Reranker-0.6B",
+             "Qwen3-Reranker-4B",
+             "Qwen3-Reranker-8B",
+         }:
+             if "hf_overrides" not in self._kwargs:
+                 self._kwargs["hf_overrides"] = {
+                     "architectures": ["Qwen3ForSequenceClassification"],
+                     "classifier_from_token": ["no", "yes"],
+                     "is_original_qwen3_reranker": True,
+                 }
+             elif isinstance(self._kwargs["hf_overrides"], dict):
+                 self._kwargs["hf_overrides"].update(
+                     architectures=["Qwen3ForSequenceClassification"],
+                     classifier_from_token=["no", "yes"],
+                     is_original_qwen3_reranker=True,
+                 )
+         self._model = LLM(model=self._model_path, task="score", **self._kwargs)
+         self._tokenizer = self._model.get_tokenizer()
+
+     @cache_clean
+     def rerank(
+         self,
+         documents: List[str],
+         query: str,
+         top_n: Optional[int],
+         max_chunks_per_doc: Optional[int],
+         return_documents: Optional[bool],
+         return_len: Optional[bool],
+         **kwargs,
+     ) -> Rerank:
+         """
+         Rerank the documents based on the query using the VLLM model.
+
+         Args:
+             documents (List[str]): List of documents to be reranked.
+             query (str): The query string to rank the documents against.
+             top_n (Optional[int]): The number of top documents to return.
+             max_chunks_per_doc (Optional[int]): Maximum chunks per document.
+             return_documents (Optional[bool]): Whether to return the documents.
+             return_len (Optional[bool]): Whether to return the length of the documents.
+
+         Returns:
+             Rerank: The reranked results.
+         """
+         if kwargs:
+             raise RuntimeError("Unexpected keyword arguments: {}".format(kwargs))
+         assert self._model is not None
+         documents_size = len(documents)
+         query_list = [query] * documents_size
+
+         if self.model_family.model_name in {
+             "Qwen3-Reranker-0.6B",
+             "Qwen3-Reranker-4B",
+             "Qwen3-Reranker-8B",
+         }:
+             instruction = "Given a web search query, retrieve relevant passages that answer the query"
+             prefix = (
+                 "<|im_start|>system\nJudge whether the Document meets the requirements based on"
+                 " the Query and the Instruct provided. "
+                 'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
+             )
+             suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+             query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
+             document_template = "<Document>: {doc}{suffix}"
+             processed_queries = [
+                 query_template.format(
+                     prefix=prefix, instruction=instruction, query=query
+                 )
+                 for query in query_list
+             ]
+             processed_documents = [
+                 document_template.format(doc=doc, suffix=suffix) for doc in documents
+             ]
+             outputs = self._model.score(
+                 processed_documents,
+                 processed_queries,
+                 use_tqdm=False,
+             )
+
+         else:
+             outputs = self._model.score(
+                 documents,
+                 query_list,
+                 use_tqdm=False,
+             )
+         scores = map(lambda scoreoutput: scoreoutput.outputs.score, outputs)
+         documents = list(map(lambda doc: Document(text=doc), documents))
+         document_parts = list(zip(range(documents_size), scores, documents))
+         document_parts.sort(key=lambda x: x[1], reverse=True)
+         if top_n is not None:
+             document_parts = document_parts[:top_n]
+         reranked_docs = list(
+             map(
+                 lambda doc: DocumentObj(
+                     index=doc[0],
+                     relevance_score=doc[1],
+                     document=doc[2] if return_documents else None,
+                 ),
+                 document_parts,
+             )
+         )
+         tokens = sum(map(lambda x: len(x.prompt_token_ids), outputs))
+         metadata = Meta(
+             api_version=None,
+             billed_units=None,
+             tokens=(
+                 RerankTokens(input_tokens=tokens, output_tokens=tokens)
+                 if return_len
+                 else None
+             ),
+             warnings=None,
+         )
+         return Rerank(id=str(uuid.uuid4()), results=reranked_docs, meta=metadata)
+
+     @classmethod
+     def check_lib(cls) -> bool:
+         return importlib.util.find_spec("vllm") is not None
+
+     @classmethod
+     def match_json(
+         cls,
+         model_family: RerankModelFamilyV2,
+         model_spec: RerankSpecV1,
+         quantization: str,
+     ) -> bool:
+         if model_spec.model_format in ["pytorch"]:
+             prefix = model_family.model_name.split("-", 1)[0]
+             if prefix in SUPPORTED_MODELS_PREFIXES:
+                 return True
+         return False
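
Note on the Qwen3 path above: VLLMRerankModel wraps every query/document pair in the chat template shown in the diff and lets vLLM's score task do the actual comparison. The prompt assembly itself is plain string formatting; a standalone sketch of only that part (standard library only, with an invented query and documents, while the real scoring would run through vllm.LLM(model=..., task="score").score(...)):

    # Mirrors the prompt assembly in VLLMRerankModel.rerank for the Qwen3 rerankers.
    instruction = "Given a web search query, retrieve relevant passages that answer the query"
    prefix = (
        "<|im_start|>system\nJudge whether the Document meets the requirements based on"
        " the Query and the Instruct provided. "
        'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
    )
    suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
    query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
    document_template = "<Document>: {doc}{suffix}"

    query = "What is the capital of France?"  # invented example input
    documents = ["Paris is the capital of France.", "Berlin is the capital of Germany."]

    processed_queries = [
        query_template.format(prefix=prefix, instruction=instruction, query=query)
        for _ in documents
    ]
    processed_documents = [
        document_template.format(doc=doc, suffix=suffix) for doc in documents
    ]

    # One (query, document) prompt pair per document; vLLM scores each pair.
    for q, d in zip(processed_queries, processed_documents):
        print(q + d)
        print("-" * 40)
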
xinference/model/utils.py CHANGED
@@ -13,6 +13,7 @@
  # limitations under the License.
 
  import asyncio
+ import functools
  import json
  import logging
  import os
@@ -454,6 +455,19 @@ def get_engine_params_by_name(
              for param in params:
                  del param["embedding_class"]
 
+         return engine_params
+     elif model_type == "rerank":
+         from .rerank.rerank_family import RERANK_ENGINES
+
+         if model_name not in RERANK_ENGINES:
+             return None
+
+         # filter rerank_class
+         engine_params = deepcopy(RERANK_ENGINES[model_name])
+         for engine, params in engine_params.items():
+             for param in params:
+                 del param["rerank_class"]
+
          return engine_params
      else:
          raise ValueError(
@@ -558,3 +572,97 @@ class ModelInstanceInfoMixin(ABC):
      @abstractmethod
      def to_version_info(self):
          """"""
+
+
+ def is_flash_attn_available() -> bool:
+     """
+     Check if flash_attention can be enabled in the current environment.
+
+     Checks the following conditions:
+     1. Whether the flash_attn package is installed
+     2. Whether CUDA GPU is available
+     3. Whether PyTorch supports CUDA
+     4. Whether GPU compute capability meets requirements (>= 8.0)
+
+     Returns:
+         bool: True if flash_attention can be enabled, False otherwise
+     """
+     import importlib.util
+
+     # Check if flash_attn is installed
+     if importlib.util.find_spec("flash_attn") is None:
+         logger.debug("flash_attn package not found")
+         return False
+
+     try:
+         import torch
+
+         # Check if CUDA is available
+         if not torch.cuda.is_available():
+             logger.debug("CUDA not available")
+             return False
+
+         # Check GPU count
+         if torch.cuda.device_count() == 0:
+             logger.debug("No CUDA devices found")
+             return False
+
+         # Check current GPU compute capability
+         # Flash Attention typically requires compute capability >= 8.0 (A100, H100, etc.)
+         current_device = torch.cuda.current_device()
+         capability = torch.cuda.get_device_capability(current_device)
+         major, minor = capability
+         compute_capability = major + minor * 0.1
+
+         if compute_capability < 8.0:
+             logger.debug(
+                 f"GPU compute capability {compute_capability} < 8.0, "
+                 "flash_attn may not work optimally"
+             )
+             return False
+
+         # Try to import flash_attn core module to verify correct installation
+         try:
+             import flash_attn
+
+             logger.debug(
+                 f"flash_attn version: {getattr(flash_attn, '__version__', 'unknown')}"
+             )
+             return True
+         except ImportError as e:
+             logger.debug(f"Failed to import flash_attn: {e}")
+             return False
+     except Exception as e:
+         logger.debug(f"Error checking flash_attn availability: {e}")
+         return False
+
+
+ def cache_clean(fn):
+     @functools.wraps(fn)
+     async def _async_wrapper(self, *args, **kwargs):
+         import gc
+
+         from ..device_utils import empty_cache
+
+         result = await fn(self, *args, **kwargs)
+
+         gc.collect()
+         empty_cache()
+         return result
+
+     @functools.wraps(fn)
+     def _wrapper(self, *args, **kwargs):
+         import gc
+
+         from ..device_utils import empty_cache
+
+         result = fn(self, *args, **kwargs)
+
+         gc.collect()
+         empty_cache()
+         return result
+
+     if asyncio.iscoroutinefunction(fn):
+         return _async_wrapper
+     else:
+         return _wrapper
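
Note on cache_clean above: it is a decorator that picks an async or sync wrapper via asyncio.iscoroutinefunction, so one decorator covers both kinds of model methods. A simplified, self-contained sketch of that dispatch pattern (the ToyRerankModel class is invented for demonstration, and xinference's device_utils.empty_cache() is replaced by a plain gc.collect()):

    import asyncio
    import functools
    import gc


    def cache_clean(fn):
        """Run the wrapped call, then trigger garbage collection; a simplified
        stand-in for xinference's decorator, which also empties the device cache."""

        @functools.wraps(fn)
        async def _async_wrapper(self, *args, **kwargs):
            result = await fn(self, *args, **kwargs)
            gc.collect()
            return result

        @functools.wraps(fn)
        def _wrapper(self, *args, **kwargs):
            result = fn(self, *args, **kwargs)
            gc.collect()
            return result

        # Pick the wrapper matching the wrapped function's calling convention.
        return _async_wrapper if asyncio.iscoroutinefunction(fn) else _wrapper


    class ToyRerankModel:  # hypothetical class, for demonstration only
        @cache_clean
        def rerank(self, docs, query):
            return sorted(docs, key=len)

        @cache_clean
        async def arerank(self, docs, query):
            return sorted(docs, key=len)


    model = ToyRerankModel()
    print(model.rerank(["bb", "a"], "q"))                # ['a', 'bb']
    print(asyncio.run(model.arerank(["ccc", "a"], "q")))  # ['a', 'ccc']
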