xinference 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only and reflects the changes between those versions.

Note: this release of xinference has been flagged as potentially problematic.

@@ -103,6 +103,86 @@
         "model_ability": "audio-to-text",
         "multilingual": false
     },
+    {
+        "model_name": "whisper-tiny-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-tiny",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-tiny.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-tiny.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-base-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-base-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-base.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-base.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-small-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-small-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-small.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-small.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-medium-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-medium-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-medium.en-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-medium.en-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": false,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-large-v3-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-large-v3-mlx",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
+    {
+        "model_name": "whisper-large-v3-turbo-mlx",
+        "model_family": "whisper",
+        "model_id": "mlx-community/whisper-large-v3-turbo",
+        "model_ability": "audio-to-text",
+        "multilingual": true,
+        "engine": "mlx"
+    },
     {
         "model_name": "SenseVoiceSmall",
         "model_family": "funasr",
@@ -0,0 +1,208 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import itertools
+import logging
+import tempfile
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class WhisperMLXModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+        self._use_lighting = False
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        use_lightning = self._kwargs.get("use_lightning", "auto")
+        if use_lightning not in ("auto", True, False, None):
+            raise ValueError("use_lightning can only be True, False, None or auto")
+
+        if use_lightning == "auto" or use_lightning is True:
+            try:
+                import mlx.core as mx
+                from lightning_whisper_mlx.transcribe import ModelHolder
+            except ImportError:
+                if use_lightning == "auto":
+                    use_lightning = False
+                else:
+                    error_message = "Failed to import module 'lightning_whisper_mlx'"
+                    installation_guide = [
+                        "Please make sure 'lightning_whisper_mlx' is installed.\n",
+                    ]
+
+                    raise ImportError(
+                        f"{error_message}\n\n{''.join(installation_guide)}"
+                    )
+            else:
+                use_lightning = True
+        if not use_lightning:
+            try:
+                import mlx.core as mx  # noqa: F811
+                from mlx_whisper.transcribe import ModelHolder  # noqa: F811
+            except ImportError:
+                error_message = "Failed to import module 'mlx_whisper'"
+                installation_guide = [
+                    "Please make sure 'mlx_whisper' is installed.\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+            else:
+                use_lightning = False
+
+        logger.info(
+            "Loading MLX whisper from %s, use lightning: %s",
+            self._model_path,
+            use_lightning,
+        )
+        self._use_lighting = use_lightning
+        self._model = ModelHolder.get_model(self._model_path, mx.float16)
+
+    def transcriptions(
+        self,
+        audio: bytes,
+        language: Optional[str] = None,
+        prompt: Optional[str] = None,
+        response_format: str = "json",
+        temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
+    ):
+        return self._call(
+            audio,
+            language=language,
+            prompt=prompt,
+            response_format=response_format,
+            temperature=temperature,
+            timestamp_granularities=timestamp_granularities,
+            task="transcribe",
+        )
+
+    def translations(
+        self,
+        audio: bytes,
+        language: Optional[str] = None,
+        prompt: Optional[str] = None,
+        response_format: str = "json",
+        temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
+    ):
+        if not self._model_spec.multilingual:
+            raise RuntimeError(
+                f"Model {self._model_spec.model_name} is not suitable for translations."
+            )
+        return self._call(
+            audio,
+            language=language,
+            prompt=prompt,
+            response_format=response_format,
+            temperature=temperature,
+            timestamp_granularities=timestamp_granularities,
+            task="translate",
+        )
+
+    def _call(
+        self,
+        audio: bytes,
+        language: Optional[str] = None,
+        prompt: Optional[str] = None,
+        response_format: str = "json",
+        temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
+        task: str = "transcribe",
+    ):
+        if self._use_lighting:
+            from lightning_whisper_mlx.transcribe import transcribe_audio
+
+            transcribe = functools.partial(
+                transcribe_audio, batch_size=self._kwargs.get("batch_size", 12)
+            )
+        else:
+            from mlx_whisper import transcribe  # type: ignore
+
+        with tempfile.NamedTemporaryFile(delete=True) as f:
+            f.write(audio)
+
+            kwargs = {"task": task}
+            if response_format == "verbose_json":
+                if timestamp_granularities == ["word"]:
+                    kwargs["word_timestamps"] = True  # type: ignore
+
+            result = transcribe(
+                f.name,
+                path_or_hf_repo=self._model_path,
+                language=language,
+                temperature=temperature,
+                initial_prompt=prompt,
+                **kwargs,
+            )
+            text = result["text"]
+            segments = result["segments"]
+            language = result["language"]
+
+            if response_format == "json":
+                return {"text": text}
+            elif response_format == "verbose_json":
+                if not timestamp_granularities or timestamp_granularities == [
+                    "segment"
+                ]:
+                    return {
+                        "task": task,
+                        "language": language,
+                        "duration": segments[-1]["end"] if segments else 0,
+                        "text": text,
+                        "segments": segments,
+                    }
+                else:
+                    assert timestamp_granularities == ["word"]
+
+                    def _extract_word(word: dict) -> dict:
+                        return {
+                            "start": word["start"].item(),
+                            "end": word["end"].item(),
+                            "word": word["word"],
+                        }
+
+                    words = [
+                        _extract_word(w)
+                        for w in itertools.chain(*[s["words"] for s in segments])
+                    ]
+                    return {
+                        "task": task,
+                        "language": language,
+                        "duration": words[-1]["end"] if words else 0,
+                        "text": text,
+                        "words": words,
+                    }
+            else:
+                raise ValueError(f"Unsupported response format: {response_format}")
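The new WhisperMLXModel reads two optional launch-time kwargs: use_lightning (default "auto", preferring lightning_whisper_mlx and falling back to mlx_whisper when it is not installed) and batch_size (used only on the lightning path). Word-level timestamps are returned only for response_format "verbose_json" with timestamp_granularities ["word"]. A hedged sketch of exercising those options through the client; whether the REST client forwards these exact kwargs end to end is an assumption:

# Hypothetical sketch: MLX whisper kwargs and word timestamps (not part of the diff).
from xinference.client import Client

client = Client("http://localhost:9997")
uid = client.launch_model(
    model_name="whisper-small-mlx",
    model_type="audio",
    use_lightning=False,  # force the plain mlx_whisper backend
)
model = client.get_model(uid)

with open("speech.wav", "rb") as f:
    result = model.transcriptions(
        f.read(),
        response_format="verbose_json",
        timestamp_granularities=["word"],  # per-word start/end times
    )
for w in result["words"]:
    print(f'{w["start"]:.2f}-{w["end"]:.2f} {w["word"]}')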
@@ -21,6 +21,7 @@ from typing import Dict, List, Literal, Optional, Tuple, Union, no_type_check
 import numpy as np
 import torch
 
+from ..._compat import ROOT_KEY, ErrorWrapper, ValidationError
 from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
 from ..core import CacheableModelSpec, ModelDescription
@@ -193,6 +194,27 @@ class EmbeddingModel:
                 device=self._device,
                 model_kwargs=model_kwargs,
             )
+        elif (
+            self._kwargs.get("hybrid_mode")
+            and "m3" in self._model_spec.model_name.lower()
+        ):
+            try:
+                from FlagEmbedding import BGEM3FlagModel
+            except ImportError:
+                error_message = "Failed to import module 'BGEM3FlagModel'"
+                installation_guide = [
+                    "Please make sure 'FlagEmbedding' is installed. ",
+                    "You can install it by `pip install FlagEmbedding`\n",
+                ]
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
+            self._model = BGEM3FlagModel(
+                self._model_path,
+                device=self._device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
         else:
             model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
             self._model = SentenceTransformer(
@@ -202,11 +224,192 @@ class EmbeddingModel:
                 trust_remote_code=True,
             )
 
+    def _fix_langchain_openai_inputs(self, sentences: Union[str, List[str]]):
+        # Check if sentences is a two-dimensional list of integers
+        if (
+            isinstance(sentences, list)
+            and len(sentences) > 0
+            and isinstance(sentences[0], list)
+            and len(sentences[0]) > 0
+            and isinstance(sentences[0][0], int)
+        ):
+            # List[List[int]] stands for encoded inputs
+            import tiktoken
+
+            enc = tiktoken.get_encoding("cl100k_base")
+            lines_decoded = []
+
+            for line in sentences:
+                try:
+                    # Decode each token into bytes, then join them into a complete string
+                    output = b"".join(
+                        enc.decode_single_token_bytes(token) for token in line
+                    )
+                    # Convert the byte sequence into a UTF-8 encoded string
+                    decoded_line = output.decode("utf-8")
+                    lines_decoded.append(decoded_line)
+                except (ValueError, TypeError, UnicodeDecodeError) as e:
+                    raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], self)
+
+            # Update sentences to be the list of decoded strings
+            if len(lines_decoded) == 1:
+                sentences = lines_decoded[0]
+            else:
+                sentences = lines_decoded
+        return sentences
+
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+        sentences = self._fix_langchain_openai_inputs(sentences)
+
+        from FlagEmbedding import BGEM3FlagModel
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
 
+        @no_type_check
+        def _encode_bgem3(
+            model: Union[SentenceTransformer, BGEM3FlagModel],
+            sentences: Union[str, List[str]],
+            batch_size: int = 32,
+            show_progress_bar: bool = None,
+            output_value: str = "sparse_embedding",
+            convert_to_numpy: bool = True,
+            convert_to_tensor: bool = False,
+            device: str = None,
+            normalize_embeddings: bool = False,
+            **kwargs,
+        ):
+            """
+            Computes sentence embeddings with bge-m3 model
+            Nothing special here, just replace sentence-transformer with FlagEmbedding
+            TODO: think about how to solve the redundant code of encode method in the future
+
+            :param sentences: the sentences to embed
+            :param batch_size: the batch size used for the computation
+            :param show_progress_bar: Output a progress bar when encode sentences
+            :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
+            :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
+            :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
+            :param device: Which torch.device to use for the computation
+            :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+
+            :return:
+               By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
+            """
+            import torch
+            from tqdm.autonotebook import trange
+
+            if show_progress_bar is None:
+                show_progress_bar = (
+                    logger.getEffectiveLevel() == logging.INFO
+                    or logger.getEffectiveLevel() == logging.DEBUG
+                )
+
+            if convert_to_tensor:
+                convert_to_numpy = False
+
+            if output_value != "sparse_embedding":
+                convert_to_tensor = False
+                convert_to_numpy = False
+
+            input_was_string = False
+            if isinstance(sentences, str) or not hasattr(
+                sentences, "__len__"
+            ):  # Cast an individual sentence to a list with length 1
+                sentences = [sentences]
+                input_was_string = True
+
+            if device is None:
+                # Same as SentenceTransformer.py
+                from sentence_transformers.util import get_device_name
+
+                device = get_device_name()
+                logger.info(f"Use pytorch device_name: {device}")
+
+            all_embeddings = []
+            all_token_nums = 0
+
+            # The original code does not support other inference engines
+            def _text_length(text):
+                if isinstance(text, dict):  # {key: value} case
+                    return len(next(iter(text.values())))
+                elif not hasattr(text, "__len__"):  # Object has no len() method
+                    return 1
+                elif len(text) == 0 or isinstance(
+                    text[0], int
+                ):  # Empty string or list of ints
+                    return len(text)
+                else:
+                    return sum(
+                        [len(t) for t in text]
+                    )  # Sum of length of individual strings
+
+            length_sorted_idx = np.argsort([-_text_length(sen) for sen in sentences])
+            sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+
+            for start_index in trange(
+                0,
+                len(sentences),
+                batch_size,
+                desc="Batches",
+                disable=not show_progress_bar,
+            ):
+                sentences_batch = sentences_sorted[
+                    start_index : start_index + batch_size
+                ]
+
+                with torch.no_grad():
+                    out_features = model.encode(sentences_batch, **kwargs)
+
+                if output_value == "token_embeddings":
+                    embeddings = []
+                    for token_emb, attention in zip(
+                        out_features[output_value], out_features["attention_mask"]
+                    ):
+                        last_mask_id = len(attention) - 1
+                        while (
+                            last_mask_id > 0 and attention[last_mask_id].item() == 0
+                        ):
+                            last_mask_id -= 1
+
+                        embeddings.append(token_emb[0 : last_mask_id + 1])
+                elif output_value is None:  # Return all outputs
+                    embeddings = []
+                    for sent_idx in range(len(out_features["sentence_embedding"])):
+                        row = {
+                            name: out_features[name][sent_idx]
+                            for name in out_features
+                        }
+                        embeddings.append(row)
+                # for sparse embedding
+                else:
+                    if kwargs.get("return_sparse"):
+                        embeddings = out_features["lexical_weights"]
+                    else:
+                        embeddings = out_features["dense_vecs"]
+
+                    if convert_to_numpy:
+                        embeddings = embeddings.cpu()
+
+                all_embeddings.extend(embeddings)
+
+            all_embeddings = [
+                all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
+            ]
+
+            if convert_to_tensor:
+                if len(all_embeddings):
+                    all_embeddings = torch.stack(all_embeddings)
+                else:
+                    all_embeddings = torch.Tensor()
+            elif convert_to_numpy:
+                all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+
+            if input_was_string:
+                all_embeddings = all_embeddings[0]
+
+            return all_embeddings, all_token_nums
+
     # copied from sentence-transformers, and modify it to return tokens num
     @no_type_check
     def encode(
@@ -390,6 +593,10 @@ class EmbeddingModel:
                 convert_to_numpy=False,
                 **kwargs,
             )
+        elif isinstance(self._model, BGEM3FlagModel):
+            all_embeddings, all_token_nums = _encode_bgem3(
+                self._model, sentences, convert_to_numpy=False, **kwargs
+            )
         else:
             all_embeddings, all_token_nums = encode(
                 self._model,
@@ -401,14 +608,30 @@ class EmbeddingModel:
            all_embeddings = [all_embeddings]
        embedding_list = []
        for index, data in enumerate(all_embeddings):
-            embedding_list.append(
-                EmbeddingData(index=index, object="embedding", embedding=data.tolist())
-            )
+            if kwargs.get("return_sparse") and isinstance(self._model, BGEM3FlagModel):
+                embedding_list.append(
+                    EmbeddingData(
+                        index=index,
+                        object="embedding",
+                        embedding={k: float(v) for k, v in data.items()},
+                    )
+                )
+            else:
+                embedding_list.append(
+                    EmbeddingData(
+                        index=index, object="embedding", embedding=data.tolist()
+                    )
+                )
        usage = EmbeddingUsage(
            prompt_tokens=all_token_nums, total_tokens=all_token_nums
        )
        result = Embedding(
-            object="list",
+            object=(
+                "list"  # type: ignore
+                if not isinstance(self._model, BGEM3FlagModel)
+                and not kwargs.get("return_sparse")
+                else "dict"
+            ),
            model=self._model_uid,
            data=embedding_list,
            usage=usage,
@@ -430,6 +653,38 @@
 
         return result
 
+    def convert_ids_to_tokens(
+        self,
+        batch_token_ids: Union[List[Union[int, str]], List[List[Union[int, str]]]],
+        **kwargs,
+    ) -> Union[List[str]]:
+        batch_decoded_texts: List[str] = []
+
+        assert self._model is not None
+
+        if isinstance(batch_token_ids, (int, str)):
+            return self._model.tokenizer.convert_ids_to_tokens(
+                [int(str(batch_token_ids))]
+            )[0]
+
+        # check if it's a nested list
+        if (
+            isinstance(batch_token_ids, list)
+            and batch_token_ids
+            and isinstance(batch_token_ids[0], list)
+        ):
+            for token_ids in batch_token_ids:
+                token_ids = [int(token_id) for token_id in token_ids]
+                batch_decoded_texts.append(
+                    self._model.tokenizer.convert_ids_to_tokens(token_ids)
+                )
+        else:
+            batch_token_ids = [int(token_id) for token_id in batch_token_ids]
+            batch_decoded_texts = self._model.tokenizer.convert_ids_to_tokens(
+                batch_token_ids
+            )
+        return batch_decoded_texts
+
 
 def match_embedding(
     model_name: str,
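Taken together, the embedding changes add a FlagEmbedding-backed path for bge-m3: passing hybrid_mode=True at launch loads BGEM3FlagModel instead of SentenceTransformer, passing return_sparse=True at request time returns lexical weights (a token-id-to-weight mapping, with the result object reported as "dict" instead of "list"), and the new convert_ids_to_tokens method on the model maps those token ids back to token strings. A minimal sketch of that flow, assuming the REST client forwards these kwargs and that the built-in bge-m3 embedding model is used:

# Hypothetical sketch: bge-m3 hybrid/sparse embeddings (not part of the diff).
from xinference.client import Client

client = Client("http://localhost:9997")
uid = client.launch_model(model_name="bge-m3", model_type="embedding", hybrid_mode=True)
model = client.get_model(uid)

dense = model.create_embedding("what is BGE M3?")                       # dense vectors, as before
sparse = model.create_embedding("what is BGE M3?", return_sparse=True)  # lexical weights

# Each sparse embedding maps token ids to float weights.
print(sparse["data"][0]["embedding"])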
@@ -233,7 +233,7 @@
     },
     {
         "model_name": "gte-Qwen2",
-        "dimensions": 4096,
+        "dimensions": 3584,
         "max_tokens": 32000,
         "language": ["zh", "en"],
         "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
@@ -235,7 +235,7 @@
     },
     {
         "model_name": "gte-Qwen2",
-        "dimensions": 4096,
+        "dimensions": 3584,
         "max_tokens": 32000,
         "language": ["zh", "en"],
         "model_id": "iic/gte_Qwen2-7B-instruct",
@@ -143,6 +143,7 @@ def _install():
     )
     from .transformers.deepseek_vl import DeepSeekVLChatModel
     from .transformers.glm4v import Glm4VModel
+    from .transformers.glm_edge_v import GlmEdgeVModel
     from .transformers.intern_vl import InternVLChatModel
     from .transformers.internlm2 import Internlm2PytorchChatModel
     from .transformers.minicpmv25 import MiniCPMV25Model
@@ -193,6 +194,7 @@ def _install():
             DeepSeekV2PytorchModel,
             DeepSeekV2PytorchChatModel,
             OptPytorchModel,
+            GlmEdgeVModel,
         ]
     )
     if OmniLMMModel:  # type: ignore