xinference 1.6.1__py3-none-any.whl → 1.7.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +64 -2
  4. xinference/core/media_interface.py +123 -0
  5. xinference/core/model.py +31 -0
  6. xinference/core/supervisor.py +8 -17
  7. xinference/core/worker.py +5 -17
  8. xinference/deploy/cmdline.py +6 -2
  9. xinference/model/audio/chattts.py +24 -39
  10. xinference/model/audio/cosyvoice.py +18 -30
  11. xinference/model/audio/funasr.py +42 -0
  12. xinference/model/audio/model_spec.json +18 -0
  13. xinference/model/audio/model_spec_modelscope.json +19 -1
  14. xinference/model/audio/utils.py +75 -0
  15. xinference/model/core.py +1 -0
  16. xinference/model/embedding/__init__.py +74 -18
  17. xinference/model/embedding/core.py +98 -597
  18. xinference/model/embedding/embed_family.py +133 -0
  19. xinference/model/embedding/flag/__init__.py +13 -0
  20. xinference/model/embedding/flag/core.py +282 -0
  21. xinference/model/embedding/model_spec.json +24 -0
  22. xinference/model/embedding/model_spec_modelscope.json +24 -0
  23. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  24. xinference/model/embedding/sentence_transformers/core.py +399 -0
  25. xinference/model/embedding/vllm/__init__.py +0 -0
  26. xinference/model/embedding/vllm/core.py +95 -0
  27. xinference/model/image/model_spec.json +20 -2
  28. xinference/model/image/model_spec_modelscope.json +21 -2
  29. xinference/model/image/stable_diffusion/core.py +144 -53
  30. xinference/model/llm/llama_cpp/memory.py +4 -2
  31. xinference/model/llm/llm_family.json +57 -0
  32. xinference/model/llm/llm_family_modelscope.json +61 -0
  33. xinference/model/llm/sglang/core.py +4 -0
  34. xinference/model/llm/utils.py +11 -0
  35. xinference/model/llm/vllm/core.py +3 -0
  36. xinference/model/rerank/core.py +96 -4
  37. xinference/model/rerank/model_spec.json +24 -0
  38. xinference/model/rerank/model_spec_modelscope.json +24 -0
  39. xinference/model/rerank/utils.py +4 -3
  40. xinference/model/utils.py +38 -1
  41. xinference/model/video/diffusers.py +65 -3
  42. xinference/model/video/model_spec.json +31 -4
  43. xinference/model/video/model_spec_modelscope.json +32 -4
  44. xinference/web/ui/build/asset-manifest.json +6 -6
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  47. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  49. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  56. xinference/web/ui/src/locales/en.json +18 -7
  57. xinference/web/ui/src/locales/ja.json +224 -0
  58. xinference/web/ui/src/locales/ko.json +224 -0
  59. xinference/web/ui/src/locales/zh.json +18 -7
  60. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/METADATA +9 -8
  61. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/RECORD +66 -57
  62. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  63. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  64. xinference/web/ui/build/static/js/main.ddf9eaee.js +0 -3
  65. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.ddf9eaee.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  73. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/WHEEL +0 -0
  74. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/entry_points.txt +0 -0
  75. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/licenses/LICENSE +0 -0
  76. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,133 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ from threading import Lock
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Type
+
+ from ..utils import is_valid_model_name
+
+ if TYPE_CHECKING:
+     from .core import EmbeddingModel, EmbeddingModelSpec
+
+ FLAG_EMBEDDER_CLASSES: List[Type["EmbeddingModel"]] = []
+ SENTENCE_TRANSFORMER_CLASSES: List[Type["EmbeddingModel"]] = []
+ VLLM_CLASSES: List[Type["EmbeddingModel"]] = []
+
+ BUILTIN_EMBEDDING_MODELS: Dict[str, Any] = {}
+ MODELSCOPE_EMBEDDING_MODELS: Dict[str, Any] = {}
+
+ logger = logging.getLogger(__name__)
+
+
+ # Desc: this file is used to manage embedding model information.
+ def match_embedding(
+     model_name: str,
+     download_hub: Optional[
+         Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+     ] = None,
+ ) -> "EmbeddingModelSpec":
+     from ..utils import download_from_modelscope
+
+     # The model info has been initialized by __init__.py from the model_spec.json files
+     from .custom import get_user_defined_embeddings
+
+     # first, check whether it is a user-defined embedding model
+     for model_spec in get_user_defined_embeddings():
+         if model_name == model_spec.model_name:
+             return model_spec
+
+     if download_hub == "modelscope" and model_name in MODELSCOPE_EMBEDDING_MODELS:
+         logger.debug(f"Embedding model {model_name} found in ModelScope.")
+         return MODELSCOPE_EMBEDDING_MODELS[model_name]
+     elif download_hub == "huggingface" and model_name in BUILTIN_EMBEDDING_MODELS:
+         logger.debug(f"Embedding model {model_name} found in Huggingface.")
+         return BUILTIN_EMBEDDING_MODELS[model_name]
+     elif download_from_modelscope() and model_name in MODELSCOPE_EMBEDDING_MODELS:
+         logger.debug(f"Embedding model {model_name} found in ModelScope.")
+         return MODELSCOPE_EMBEDDING_MODELS[model_name]
+     elif model_name in BUILTIN_EMBEDDING_MODELS:
+         logger.debug(f"Embedding model {model_name} found in Huggingface.")
+         return BUILTIN_EMBEDDING_MODELS[model_name]
+     else:
+         raise ValueError(
+             f"Embedding model {model_name} not found, available "
+             f"Huggingface: {BUILTIN_EMBEDDING_MODELS.keys()} "
+             f"ModelScope: {MODELSCOPE_EMBEDDING_MODELS.keys()}"
+         )
+
+
+ # { embedding model name -> { engine name -> engine params } }
+ EMBEDDING_ENGINES: Dict[str, Dict[str, List[Dict[str, Type["EmbeddingModel"]]]]] = {}
+ SUPPORTED_ENGINES: Dict[str, List[Type["EmbeddingModel"]]] = {}
+ UD_EMBEDDING_FAMILIES_LOCK = Lock()
+ # user-defined embedding models
+ UD_EMBEDDING_SPECS: Dict[str, "EmbeddingModelSpec"] = {}
+
+
+ def register_embedding(custom_embedding_spec: "EmbeddingModelSpec", persist: bool):
+     from ..utils import is_valid_model_uri
+     from . import generate_engine_config_by_model_name
+
+     if not is_valid_model_name(custom_embedding_spec.model_name):
+         raise ValueError(f"Invalid model name {custom_embedding_spec.model_name}.")
+
+     model_uri = custom_embedding_spec.model_uri
+     if model_uri and not is_valid_model_uri(model_uri):
+         raise ValueError(f"Invalid model URI {model_uri}.")
+
+     with UD_EMBEDDING_FAMILIES_LOCK:
+         if (
+             custom_embedding_spec.model_name in BUILTIN_EMBEDDING_MODELS
+             or custom_embedding_spec.model_name in MODELSCOPE_EMBEDDING_MODELS
+             or custom_embedding_spec.model_name in UD_EMBEDDING_SPECS
+         ):
+             raise ValueError(
+                 f"Model name conflicts with existing model {custom_embedding_spec.model_name}"
+             )
+
+         UD_EMBEDDING_SPECS[custom_embedding_spec.model_name] = custom_embedding_spec
+         generate_engine_config_by_model_name(custom_embedding_spec)
+
+
+ # TODO: add persist feature
+ def unregister_embedding(custom_embedding_spec: "EmbeddingModelSpec"):
+     with UD_EMBEDDING_FAMILIES_LOCK:
+         model_name = custom_embedding_spec.model_name
+         if model_name in UD_EMBEDDING_SPECS:
+             del UD_EMBEDDING_SPECS[model_name]
+         if model_name in EMBEDDING_ENGINES:
+             del EMBEDDING_ENGINES[model_name]
+
+
+ def check_engine_by_model_name_and_engine(
+     model_name: str,
+     model_engine: str,
+ ) -> Type["EmbeddingModel"]:
+     def get_model_engine_from_spell(engine_str: str) -> str:
+         for engine in EMBEDDING_ENGINES[model_name].keys():
+             if engine.lower() == engine_str.lower():
+                 return engine
+         return engine_str
+
+     if model_name not in EMBEDDING_ENGINES:
+         raise ValueError(f"Model {model_name} not found.")
+     model_engine = get_model_engine_from_spell(model_engine)
+     if model_engine not in EMBEDDING_ENGINES[model_name]:
+         raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
+     match_params = EMBEDDING_ENGINES[model_name][model_engine]
+     for param in match_params:
+         if model_name == param["model_name"]:
+             return param["embedding_class"]
+     raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
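For orientation, here is a minimal sketch (not part of the diff) of how this new embed_family registry is typically exercised. The engine key "flag" and the local model path are assumptions for illustration; in a real deployment the supervisor/worker calls these helpers internally after downloading the model.

```python
# Hedged sketch: resolve a built-in embedding spec, then pick the concrete
# implementation class for a (model, engine) pair. The engine name "flag"
# and the path below are assumptions, not values taken from this diff.
from xinference.model.embedding.embed_family import (
    check_engine_by_model_name_and_engine,
    match_embedding,
)

spec = match_embedding("bge-m3", download_hub="huggingface")  # built-in spec lookup
embedding_cls = check_engine_by_model_name_and_engine("bge-m3", "flag")

model = embedding_cls(
    model_uid="bge-m3-0",         # hypothetical replica uid
    model_path="/models/bge-m3",  # hypothetical download location
    model_spec=spec,
)
model.load()
```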
@@ -0,0 +1,13 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
@@ -0,0 +1,282 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import importlib.util
+ import logging
+ from typing import List, Optional, Union, no_type_check
+
+ import numpy as np
+ import torch
+
+ try:
+     from FlagEmbedding.inference.embedder.model_mapping import (
+         support_native_bge_model_list,
+     )
+
+     flag_installed = True
+ except ImportError:
+     flag_installed = False
+
+ from ....device_utils import get_available_device
+ from ....types import Embedding, EmbeddingData, EmbeddingUsage
+ from ..core import EmbeddingModel, EmbeddingModelSpec
+
+ FLAG_EMBEDDER_MODEL_LIST = support_native_bge_model_list() if flag_installed else []
+ logger = logging.getLogger(__name__)
+
+
+ class FlagEmbeddingModel(EmbeddingModel):
+     def __init__(
+         self,
+         model_uid: str,
+         model_path: str,
+         model_spec: EmbeddingModelSpec,
+         device: Optional[str] = None,
+         return_sparse: bool = False,
+         **kwargs,
+     ):
+         super().__init__(model_uid, model_path, model_spec, device, **kwargs)
+         self._return_sparse = return_sparse
+
+     def load(self):
+         try:
+             from FlagEmbedding import BGEM3FlagModel
+         except ImportError:
+             error_message = "Failed to import module 'BGEM3FlagModel'"
+             installation_guide = [
+                 "Please make sure 'FlagEmbedding' is installed. ",
+                 "You can install it by `pip install FlagEmbedding`\n",
+             ]
+             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+         torch_dtype = None
+         if torch_dtype_str := self._kwargs.get("torch_dtype"):
+             try:
+                 torch_dtype = getattr(torch, torch_dtype_str)
+                 if torch_dtype not in [
+                     torch.float16,
+                     torch.float32,
+                     torch.bfloat16,
+                 ]:
+                     logger.warning(
+                         f"The BGE engine only supports fp16, but got {torch_dtype_str}. Using default torch dtype: fp16."
+                     )
+                     torch_dtype = torch.float16
+             except AttributeError:
+                 logger.warning(
+                     f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp16."
+                 )
+                 torch_dtype = torch.float16
+
+         if torch_dtype and torch_dtype == torch.float16:
+             model_kwargs = {"use_fp16": True}
+         else:
+             model_kwargs = {}
+         self._model = BGEM3FlagModel(
+             self._model_path,
+             device=self._device,
+             trust_remote_code=True,
+             return_sparse=self._return_sparse,
+             **model_kwargs,
+         )
+         self._tokenizer = self._model.tokenizer
+
+     def create_embedding(
+         self,
+         sentences: Union[str, List[str]],
+         **kwargs,
+     ):
+         from FlagEmbedding import BGEM3FlagModel
+
+         # FlagEmbedding does not have this param
+         # kwargs.setdefault("normalize_embeddings", True)
+         model_uid = kwargs.pop("model_uid", None)
+
+         @no_type_check
+         def encode(
+             model: Union[BGEM3FlagModel],
+             sentences: Union[str, List[str]],
+             batch_size: int = 32,
+             show_progress_bar: bool = None,
+             output_value: str = "sparse_embedding",
+             convert_to_numpy: bool = True,
+             convert_to_tensor: bool = False,
+             device: str = None,
+             normalize_embeddings: bool = False,
+             **kwargs,
+         ):
+             """
+             Computes sentence embeddings with the bge-m3 model.
+             Nothing special here; this just replaces sentence-transformers with FlagEmbedding.
+             TODO: think about how to remove the redundant encode code in the future
+
+             :param sentences: the sentences to embed
+             :param batch_size: the batch size used for the computation
+             :param show_progress_bar: output a progress bar when encoding sentences
+             :param output_value: default "sparse_embedding" to get sentence embeddings; can be set to "token_embeddings" to get wordpiece token embeddings; set to None to get all output values
+             :param convert_to_numpy: if true, the output is a list of numpy vectors; else, it is a list of pytorch tensors
+             :param convert_to_tensor: if true, you get one large tensor as return; overwrites any setting from convert_to_numpy
+             :param device: which torch.device to use for the computation
+             :param normalize_embeddings: if set to true, returned vectors will have length 1; the faster dot-product (util.dot_score) can then be used instead of cosine similarity
+
+             :return:
+                 By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
+             """
+             import torch
+             from tqdm.autonotebook import trange
+
+             if show_progress_bar is None:
+                 show_progress_bar = (
+                     logger.getEffectiveLevel() == logging.INFO
+                     or logger.getEffectiveLevel() == logging.DEBUG
+                 )
+
+             if convert_to_tensor:
+                 convert_to_numpy = False
+
+             if output_value != "sparse_embedding":
+                 convert_to_tensor = False
+                 convert_to_numpy = False
+
+             input_was_string = False
+             if isinstance(sentences, str) or not hasattr(
+                 sentences, "__len__"
+             ):  # Cast an individual sentence to a list with length 1
+                 sentences = [sentences]
+                 input_was_string = True
+
+             if device is None:
+                 device = get_available_device()
+                 logger.info(f"Use pytorch device_name: {device}")
+
+             all_embeddings = []
+
+             length_sorted_idx = np.argsort(
+                 [-self._text_length(sen) for sen in sentences]
+             )
+             sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+
+             for start_index in trange(
+                 0,
+                 len(sentences),
+                 batch_size,
+                 desc="Batches",
+                 disable=not show_progress_bar,
+             ):
+                 sentences_batch = sentences_sorted[
+                     start_index : start_index + batch_size
+                 ]
+
+                 with torch.no_grad():
+                     out_features = model.encode(sentences_batch, **kwargs)
+
+                     if output_value == "token_embeddings":
+                         embeddings = []
+                         for token_emb, attention in zip(
+                             out_features[output_value], out_features["attention_mask"]
+                         ):
+                             last_mask_id = len(attention) - 1
+                             while (
+                                 last_mask_id > 0 and attention[last_mask_id].item() == 0
+                             ):
+                                 last_mask_id -= 1
+
+                             embeddings.append(token_emb[0 : last_mask_id + 1])
+                     elif output_value is None:  # Return all outputs
+                         embeddings = []
+                         for sent_idx in range(len(out_features["sentence_embedding"])):
+                             row = {
+                                 name: out_features[name][sent_idx]
+                                 for name in out_features
+                             }
+                             embeddings.append(row)
+                     # for sparse embedding
+                     else:
+                         # TODO: need to check whether we can return dense_vecs and lexical_weights at the same time
+                         if kwargs.get("return_sparse"):
+                             embeddings = out_features["lexical_weights"]
+                         else:
+                             embeddings = out_features["dense_vecs"]
+
+                         if convert_to_numpy:
+                             embeddings = embeddings.cpu()
+
+                     all_embeddings.extend(embeddings)
+
+             all_embeddings = [
+                 all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
+             ]
+
+             if convert_to_tensor:
+                 if len(all_embeddings):
+                     all_embeddings = torch.stack(all_embeddings)
+                 else:
+                     all_embeddings = torch.Tensor()
+             elif convert_to_numpy:
+                 all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+
+             if input_was_string:
+                 all_embeddings = all_embeddings[0]
+
+             return all_embeddings
+
+         all_embeddings = encode(
+             self._model,
+             sentences,
+             convert_to_numpy=False,
+             **kwargs,
+         )
+
+         if isinstance(sentences, str):
+             all_embeddings = [all_embeddings]
+         embedding_list = []
+         for index, data in enumerate(all_embeddings):
+             if kwargs.get("return_sparse"):
+                 embedding_list.append(
+                     EmbeddingData(
+                         index=index,
+                         object="sparse_embedding",
+                         embedding={k: float(v) for k, v in data.items()},
+                     )
+                 )
+             else:
+                 embedding_list.append(
+                     EmbeddingData(
+                         index=index, object="embedding", embedding=data.tolist()
+                     )
+                 )
+         usage = EmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
+         result = Embedding(
+             object=("list" if kwargs.get("return_sparse") else "dict"),  # type: ignore
+             model=model_uid,
+             model_replica=self._model_uid,
+             data=embedding_list,
+             usage=usage,
+         )
+
+         # clean cache if possible
+         # TODO: support token statistics
+         self._clean_cache_if_needed(all_token_nums=0)
+
+         return result
+
+     @classmethod
+     def check_lib(cls) -> bool:
+         return importlib.util.find_spec("FlagEmbedding") is not None
+
+     @classmethod
+     def match_json(cls, model_spec: EmbeddingModelSpec) -> bool:
+         if model_spec.model_name in FLAG_EMBEDDER_MODEL_LIST:
+             return True
+         return False
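A hedged usage sketch for the new FlagEmbeddingModel follows; the model path and uids are hypothetical, and in practice the worker instantiates this class rather than user code. It assumes bge-m3 (a FlagEmbedding-native model) is already downloaded locally.

```python
# Hedged sketch, assuming bge-m3 is available at a local path.
from xinference.model.embedding.embed_family import match_embedding
from xinference.model.embedding.flag.core import FlagEmbeddingModel

spec = match_embedding("bge-m3")
model = FlagEmbeddingModel(
    model_uid="bge-m3-replica-0",  # hypothetical replica uid
    model_path="/models/bge-m3",   # hypothetical download location
    model_spec=spec,
    return_sparse=False,           # constructor flag forwarded to BGEM3FlagModel
)
model.load()

# Dense embeddings by default; passing return_sparse=True to
# create_embedding instead yields lexical weights ({token: weight}).
result = model.create_embedding(["hello world", "你好"], model_uid="bge-m3")
print(len(result["data"][0]["embedding"]))  # 1024 for bge-m3
```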
@@ -239,6 +239,30 @@
      "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
      "model_revision": "e26182b2122f4435e8b3ebecbf363990f409b45b"
    },
+   {
+     "model_name": "Qwen3-Embedding-0.6B",
+     "dimensions": 1024,
+     "max_tokens": 32768,
+     "language": ["zh", "en"],
+     "model_id": "Qwen/Qwen3-Embedding-0.6B",
+     "model_revision": "744169034862c8eec56628663995004342e4e449"
+   },
+   {
+     "model_name": "Qwen3-Embedding-4B",
+     "dimensions": 2560,
+     "max_tokens": 32768,
+     "language": ["zh", "en"],
+     "model_id": "Qwen/Qwen3-Embedding-4B",
+     "model_revision": "408b81b7fab742073065d5b3661fa74c1b3ee0a1"
+   },
+   {
+     "model_name": "Qwen3-Embedding-8B",
+     "dimensions": 4096,
+     "max_tokens": 32768,
+     "language": ["zh", "en"],
+     "model_id": "Qwen/Qwen3-Embedding-8B",
+     "model_revision": "a3d38e32b9c835d5b3d0d0a3ef3c133bbea92539"
+   },
    {
      "model_name": "jina-embeddings-v3",
      "dimensions": 1024,
@@ -241,6 +241,30 @@
      "model_id": "iic/gte_Qwen2-7B-instruct",
      "model_hub": "modelscope"
    },
+   {
+     "model_name": "Qwen3-Embedding-0.6B",
+     "dimensions": 1024,
+     "max_tokens": 32768,
+     "language": ["zh", "en"],
+     "model_id": "Qwen/Qwen3-Embedding-0.6B",
+     "model_hub": "modelscope"
+   },
+   {
+     "model_name": "Qwen3-Embedding-4B",
+     "dimensions": 2560,
+     "max_tokens": 32768,
+     "language": ["zh", "en"],
+     "model_id": "Qwen/Qwen3-Embedding-4B",
+     "model_hub": "modelscope"
+   },
+   {
+     "model_name": "Qwen3-Embedding-8B",
+     "dimensions": 4096,
+     "max_tokens": 32768,
+     "language": ["zh", "en"],
+     "model_id": "Qwen/Qwen3-Embedding-8B",
+     "model_hub": "modelscope"
+   },
    {
      "model_name": "jina-embeddings-v3",
      "dimensions": 1024,
@@ -0,0 +1,13 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.