PyPI - xinference - Versions diffs - 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (343) hide show

xinference/model/embedding/core.py CHANGED Viewed

@@ -208,12 +208,14 @@ class EmbeddingModel:
                 ]
                 raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-            model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
+            if torch_dtype and torch_dtype == torch.float16:
+                model_kwargs = {"use_fp16": True}
+            else:
+                model_kwargs = {}
             self._model = BGEM3FlagModel(
                 self._model_path,
                 device=self._device,
-                model_kwargs=model_kwargs,
-                trust_remote_code=True,
+                **model_kwargs,
             )
         else:
             model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
@@ -224,7 +226,9 @@ class EmbeddingModel:
                 trust_remote_code=True,
             )
-    def _fix_langchain_openai_inputs(self, sentences: Union[str, List[str]]):
+    def _fix_langchain_openai_inputs(
+        self, sentences: Union[str, List[str], Dict[str, str], List[Dict[str, str]]]
+    ):
         # Check if sentences is a two-dimensional list of integers
         if (
             isinstance(sentences, list)
@@ -258,157 +262,172 @@ class EmbeddingModel:
                 sentences = lines_decoded
         return sentences
-    def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+    def create_embedding(
+        self,
+        sentences: Union[str, List[str]],
+        **kwargs,
+    ):
         sentences = self._fix_langchain_openai_inputs(sentences)
-        from FlagEmbedding import BGEM3FlagModel
         from sentence_transformers import SentenceTransformer
         kwargs.setdefault("normalize_embeddings", True)
-        @no_type_check
-        def _encode_bgem3(
-            model: Union[SentenceTransformer, BGEM3FlagModel],
-            sentences: Union[str, List[str]],
-            batch_size: int = 32,
-            show_progress_bar: bool = None,
-            output_value: str = "sparse_embedding",
-            convert_to_numpy: bool = True,
-            convert_to_tensor: bool = False,
-            device: str = None,
-            normalize_embeddings: bool = False,
-            **kwargs,
-        ):
-            """
-            Computes sentence embeddings with bge-m3 model
-            Nothing special here, just replace sentence-transformer with FlagEmbedding
-            TODO: think about how to solve the redundant code of encode method in the future
-            :param sentences: the sentences to embed
-            :param batch_size: the batch size used for the computation
-            :param show_progress_bar: Output a progress bar when encode sentences
-            :param output_value:  Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
-            :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
-            :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
-            :param device: Which torch.device to use for the computation
-            :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+        try:
+            from FlagEmbedding import BGEM3FlagModel
+            @no_type_check
+            def _encode_bgem3(
+                model: Union[SentenceTransformer, BGEM3FlagModel],
+                sentences: Union[str, List[str]],
+                batch_size: int = 32,
+                show_progress_bar: bool = None,
+                output_value: str = "sparse_embedding",
+                convert_to_numpy: bool = True,
+                convert_to_tensor: bool = False,
+                device: str = None,
+                normalize_embeddings: bool = False,
+                **kwargs,
+            ):
+                """
+                Computes sentence embeddings with bge-m3 model
+                Nothing special here, just replace sentence-transformer with FlagEmbedding
+                TODO: think about how to solve the redundant code of encode method in the future
+                :param sentences: the sentences to embed
+                :param batch_size: the batch size used for the computation
+                :param show_progress_bar: Output a progress bar when encode sentences
+                :param output_value:  Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
+                :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
+                :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
+                :param device: Which torch.device to use for the computation
+                :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+                :return:
+                    By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
+                """
+                import torch
+                from tqdm.autonotebook import trange
+                if show_progress_bar is None:
+                    show_progress_bar = (
+                        logger.getEffectiveLevel() == logging.INFO
+                        or logger.getEffectiveLevel() == logging.DEBUG
+                    )
-            :return:
-               By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
-            """
-            import torch
-            from tqdm.autonotebook import trange
+                if convert_to_tensor:
+                    convert_to_numpy = False
+                if output_value != "sparse_embedding":
+                    convert_to_tensor = False
+                    convert_to_numpy = False
+                input_was_string = False
+                if isinstance(sentences, str) or not hasattr(
+                    sentences, "__len__"
+                ):  # Cast an individual sentence to a list with length 1
+                    sentences = [sentences]
+                    input_was_string = True
+                if device is None:
+                    # Same as SentenceTransformer.py
+                    from sentence_transformers.util import get_device_name
+                    device = get_device_name()
+                    logger.info(f"Use pytorch device_name: {device}")
+                all_embeddings = []
+                all_token_nums = 0
+                # The original code does not support other inference engines
+                def _text_length(text):
+                    if isinstance(text, dict):  # {key: value} case
+                        return len(next(iter(text.values())))
+                    elif not hasattr(text, "__len__"):  # Object has no len() method
+                        return 1
+                    elif len(text) == 0 or isinstance(
+                        text[0], int
+                    ):  # Empty string or list of ints
+                        return len(text)
+                    else:
+                        return sum(
+                            [len(t) for t in text]
+                        )  # Sum of length of individual strings
-            if show_progress_bar is None:
-                show_progress_bar = (
-                    logger.getEffectiveLevel() == logging.INFO
-                    or logger.getEffectiveLevel() == logging.DEBUG
+                length_sorted_idx = np.argsort(
+                    [-_text_length(sen) for sen in sentences]
                 )
+                sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+                for start_index in trange(
+                    0,
+                    len(sentences),
+                    batch_size,
+                    desc="Batches",
+                    disable=not show_progress_bar,
+                ):
+                    sentences_batch = sentences_sorted[
+                        start_index : start_index + batch_size
+                    ]
+                    with torch.no_grad():
+                        out_features = model.encode(sentences_batch, **kwargs)
+                        if output_value == "token_embeddings":
+                            embeddings = []
+                            for token_emb, attention in zip(
+                                out_features[output_value],
+                                out_features["attention_mask"],
+                            ):
+                                last_mask_id = len(attention) - 1
+                                while (
+                                    last_mask_id > 0
+                                    and attention[last_mask_id].item() == 0
+                                ):
+                                    last_mask_id -= 1
+                                embeddings.append(token_emb[0 : last_mask_id + 1])
+                        elif output_value is None:  # Return all outputs
+                            embeddings = []
+                            for sent_idx in range(
+                                len(out_features["sentence_embedding"])
+                            ):
+                                row = {
+                                    name: out_features[name][sent_idx]
+                                    for name in out_features
+                                }
+                                embeddings.append(row)
+                        # for sparse embedding
+                        else:
+                            if kwargs.get("return_sparse"):
+                                embeddings = out_features["lexical_weights"]
+                            else:
+                                embeddings = out_features["dense_vecs"]
-            if convert_to_tensor:
-                convert_to_numpy = False
-            if output_value != "sparse_embedding":
-                convert_to_tensor = False
-                convert_to_numpy = False
-            input_was_string = False
-            if isinstance(sentences, str) or not hasattr(
-                sentences, "__len__"
-            ):  # Cast an individual sentence to a list with length 1
-                sentences = [sentences]
-                input_was_string = True
-            if device is None:
-                # Same as SentenceTransformer.py
-                from sentence_transformers.util import get_device_name
-                device = get_device_name()
-                logger.info(f"Use pytorch device_name: {device}")
-            all_embeddings = []
-            all_token_nums = 0
-            # The original code does not support other inference engines
-            def _text_length(text):
-                if isinstance(text, dict):  # {key: value} case
-                    return len(next(iter(text.values())))
-                elif not hasattr(text, "__len__"):  # Object has no len() method
-                    return 1
-                elif len(text) == 0 or isinstance(
-                    text[0], int
-                ):  # Empty string or list of ints
-                    return len(text)
-                else:
-                    return sum(
-                        [len(t) for t in text]
-                    )  # Sum of length of individual strings
+                            if convert_to_numpy:
+                                embeddings = embeddings.cpu()
-            length_sorted_idx = np.argsort([-_text_length(sen) for sen in sentences])
-            sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+                        all_embeddings.extend(embeddings)
-            for start_index in trange(
-                0,
-                len(sentences),
-                batch_size,
-                desc="Batches",
-                disable=not show_progress_bar,
-            ):
-                sentences_batch = sentences_sorted[
-                    start_index : start_index + batch_size
+                all_embeddings = [
+                    all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
                 ]
-                with torch.no_grad():
-                    out_features = model.encode(sentences_batch, **kwargs)
-                    if output_value == "token_embeddings":
-                        embeddings = []
-                        for token_emb, attention in zip(
-                            out_features[output_value], out_features["attention_mask"]
-                        ):
-                            last_mask_id = len(attention) - 1
-                            while (
-                                last_mask_id > 0 and attention[last_mask_id].item() == 0
-                            ):
-                                last_mask_id -= 1
-                            embeddings.append(token_emb[0 : last_mask_id + 1])
-                    elif output_value is None:  # Return all outputs
-                        embeddings = []
-                        for sent_idx in range(len(out_features["sentence_embedding"])):
-                            row = {
-                                name: out_features[name][sent_idx]
-                                for name in out_features
-                            }
-                            embeddings.append(row)
-                    # for sparse embedding
+                if convert_to_tensor:
+                    if len(all_embeddings):
+                        all_embeddings = torch.stack(all_embeddings)
                     else:
-                        if kwargs.get("return_sparse"):
-                            embeddings = out_features["lexical_weights"]
-                        else:
-                            embeddings = out_features["dense_vecs"]
-                        if convert_to_numpy:
-                            embeddings = embeddings.cpu()
-                    all_embeddings.extend(embeddings)
-            all_embeddings = [
-                all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
-            ]
+                        all_embeddings = torch.Tensor()
+                elif convert_to_numpy:
+                    all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
-            if convert_to_tensor:
-                if len(all_embeddings):
-                    all_embeddings = torch.stack(all_embeddings)
-                else:
-                    all_embeddings = torch.Tensor()
-            elif convert_to_numpy:
-                all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+                if input_was_string:
+                    all_embeddings = all_embeddings[0]
-            if input_was_string:
-                all_embeddings = all_embeddings[0]
+                return all_embeddings, all_token_nums
-            return all_embeddings, all_token_nums
+        except ImportError:
+            _encode_bgem3 = None
         # copied from sentence-transformers, and modify it to return tokens num
         @no_type_check
@@ -526,7 +545,11 @@ class EmbeddingModel:
                 features.update(extra_features)
                 # when batching, the attention mask 1 means there is a token
                 # thus we just sum up it to get the total number of tokens
-                all_token_nums += features["attention_mask"].sum().item()
+                if "clip" in self._model_spec.model_name.lower():
+                    all_token_nums += features["input_ids"].numel()
+                    all_token_nums += features["pixel_values"].numel()
+                else:
+                    all_token_nums += features["attention_mask"].sum().item()
                 with torch.no_grad():
                     out_features = model.forward(features, **kwargs)
@@ -582,6 +605,10 @@ class EmbeddingModel:
             return all_embeddings, all_token_nums
+        is_bge_m3_flag_model = (
+            self._kwargs.get("hybrid_mode")
+            and "m3" in self._model_spec.model_name.lower()
+        )
         if (
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
@@ -593,10 +620,45 @@ class EmbeddingModel:
                 convert_to_numpy=False,
                 **kwargs,
             )
-        elif isinstance(self._model, BGEM3FlagModel):
+        elif is_bge_m3_flag_model:
+            assert _encode_bgem3 is not None
             all_embeddings, all_token_nums = _encode_bgem3(
                 self._model, sentences, convert_to_numpy=False, **kwargs
             )
+        elif "clip" in self._model_spec.model_name.lower():
+            import base64
+            import re
+            from io import BytesIO
+            from PIL import Image
+            def base64_to_image(base64_str: str) -> Image.Image:
+                # base64_data = re.sub("^data:image/.+;base64,", "", base64_str)
+                base64_data = base64_str.split(",", 1)[1]
+                byte_data = base64.b64decode(base64_data)
+                image_data = BytesIO(byte_data)
+                img = Image.open(image_data)
+                return img
+            objs: list[dict[str, str]] = []
+            for item in sentences:
+                if isinstance(item, dict):
+                    if item.get("text") is not None:
+                        objs.append(item["text"])
+                    elif item.get("image") is not None:
+                        if re.match(r"^data:image/.+;base64,", item["image"]):
+                            image = base64_to_image(item["image"])
+                            objs.append(image)
+                        else:
+                            objs.append(item["image"])
+                    else:
+                        logger.error("Please check the input data.")
+            all_embeddings, all_token_nums = encode(
+                self._model,
+                objs,
+                convert_to_numpy=False,
+                **self._kwargs,
+            )
         else:
             all_embeddings, all_token_nums = encode(
                 self._model,
@@ -608,7 +670,7 @@ class EmbeddingModel:
             all_embeddings = [all_embeddings]
         embedding_list = []
         for index, data in enumerate(all_embeddings):
-            if kwargs.get("return_sparse") and isinstance(self._model, BGEM3FlagModel):
+            if kwargs.get("return_sparse") and is_bge_m3_flag_model:
                 embedding_list.append(
                     EmbeddingData(
                         index=index,
@@ -628,8 +690,7 @@ class EmbeddingModel:
         result = Embedding(
             object=(
                 "list"  # type: ignore
-                if not isinstance(self._model, BGEM3FlagModel)
-                and not kwargs.get("return_sparse")
+                if not is_bge_m3_flag_model and not kwargs.get("return_sparse")
                 else "dict"
             ),
             model=self._model_uid,

xinference/model/embedding/model_spec.json CHANGED Viewed

@@ -245,5 +245,12 @@
     "max_tokens": 8192,
     "language": ["zh", "en"],
     "model_id": "jinaai/jina-embeddings-v3"
+  },
+  {
+    "model_name": "jina-clip-v2",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["89 languages supported"],
+    "model_id": "jinaai/jina-clip-v2"
   }
 ]

xinference/model/embedding/model_spec_modelscope.json CHANGED Viewed

@@ -248,5 +248,13 @@
     "language": ["zh", "en"],
     "model_id": "jinaai/jina-embeddings-v3",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "jina-clip-v2",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["89 languages supported"],
+    "model_id": "jinaai/jina-clip-v2",
+    "model_hub": "modelscope"
   }
 ]

xinference/model/image/core.py CHANGED Viewed

@@ -22,7 +22,12 @@ from typing import Dict, List, Literal, Optional, Tuple, Union
 from ...constants import XINFERENCE_CACHE_DIR
 from ...types import PeftModelConfig
 from ..core import CacheableModelSpec, ModelDescription
-from ..utils import valid_model_revision
+from ..utils import (
+    IS_NEW_HUGGINGFACE_HUB,
+    retry_download,
+    symlink_local_file,
+    valid_model_revision,
+)
 from .ocr.got_ocr2 import GotOCR2Model
 from .stable_diffusion.core import DiffusionModel
 from .stable_diffusion.mlx import MLXDiffusionModel
@@ -51,6 +56,9 @@ class ImageModelFamilyV1(CacheableModelSpec):
     controlnet: Optional[List["ImageModelFamilyV1"]]
     default_model_config: Optional[dict] = {}
     default_generate_config: Optional[dict] = {}
+    gguf_model_id: Optional[str]
+    gguf_quantizations: Optional[List[str]]
+    gguf_model_file_name_template: Optional[str]
 class ImageModelDescription(ModelDescription):
@@ -187,6 +195,61 @@ def get_cache_status(
         return valid_model_revision(meta_path, model_spec.model_revision)
+def cache_gguf(spec: ImageModelFamilyV1, quantization: Optional[str] = None):
+    if not quantization:
+        return
+    cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, spec.model_name))
+    if not os.path.exists(cache_dir):
+        os.makedirs(cache_dir, exist_ok=True)
+    if not spec.gguf_model_file_name_template:
+        raise NotImplementedError(
+            f"{spec.model_name} does not support GGUF quantization"
+        )
+    if quantization not in (spec.gguf_quantizations or []):
+        raise ValueError(
+            f"Cannot support quantization {quantization}, "
+            f"available quantizations: {spec.gguf_quantizations}"
+        )
+    filename = spec.gguf_model_file_name_template.format(quantization=quantization)  # type: ignore
+    full_path = os.path.join(cache_dir, filename)
+    if spec.model_hub == "huggingface":
+        import huggingface_hub
+        use_symlinks = {}
+        if not IS_NEW_HUGGINGFACE_HUB:
+            use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+        download_file_path = retry_download(
+            huggingface_hub.hf_hub_download,
+            spec.model_name,
+            None,
+            spec.gguf_model_id,
+            filename=filename,
+            **use_symlinks,
+        )
+        if IS_NEW_HUGGINGFACE_HUB:
+            symlink_local_file(download_file_path, cache_dir, filename)
+    elif spec.model_hub == "modelscope":
+        from modelscope.hub.file_download import model_file_download
+        download_file_path = retry_download(
+            model_file_download,
+            spec.model_name,
+            None,
+            spec.gguf_model_id,
+            filename,
+            revision=spec.model_revision,
+        )
+        symlink_local_file(download_file_path, cache_dir, filename)
+    else:
+        raise NotImplementedError
+    return full_path
 def create_ocr_model_instance(
     subpool_addr: str,
     devices: List[str],
@@ -219,6 +282,8 @@ def create_image_model_instance(
         Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
     ] = None,
     model_path: Optional[str] = None,
+    gguf_quantization: Optional[str] = None,
+    gguf_model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[
     Union[DiffusionModel, MLXDiffusionModel, GotOCR2Model], ImageModelDescription
@@ -272,6 +337,8 @@ def create_image_model_instance(
             ]
     if not model_path:
         model_path = cache(model_spec)
+    if not gguf_model_path and gguf_quantization:
+        gguf_model_path = cache_gguf(model_spec, gguf_quantization)
     if peft_model_config is not None:
         lora_model = peft_model_config.peft_model
         lora_load_kwargs = peft_model_config.image_lora_load_kwargs
@@ -298,6 +365,7 @@ def create_image_model_instance(
         lora_load_kwargs=lora_load_kwargs,
         lora_fuse_kwargs=lora_fuse_kwargs,
         model_spec=model_spec,
+        gguf_model_path=gguf_model_path,
         **kwargs,
     )
     model_description = ImageModelDescription(

xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

Potentially problematic release.

xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl