xinference 0.10.2.post1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/event.py +1 -1
- xinference/core/model.py +15 -4
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +73 -102
- xinference/deploy/cmdline.py +175 -6
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +5 -1
- xinference/model/audio/model_spec.json +8 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/embedding/core.py +13 -0
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +446 -2
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +208 -1
- xinference/model/llm/pytorch/deepseek_vl.py +89 -33
- xinference/model/llm/pytorch/qwen_vl.py +67 -12
- xinference/model/llm/pytorch/yi_vl.py +62 -45
- xinference/model/llm/utils.py +45 -15
- xinference/model/llm/vllm/core.py +21 -4
- xinference/model/rerank/core.py +48 -20
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/METADATA +14 -13
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/RECORD +81 -60
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
xinference/deploy/cmdline.py
CHANGED
@@ -17,7 +17,7 @@ import logging
 import os
 import sys
 import warnings
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Sequence, Tuple, Union
 
 import click
 from xoscar.utils import get_next_port
@@ -598,6 +598,13 @@ def list_model_registrations(
     default="LLM",
     help="Specify type of model, LLM as default.",
 )
+@click.option(
+    "--model-engine",
+    "-en",
+    type=str,
+    default=None,
+    help="Specify the inference engine of the model when launching LLM.",
+)
 @click.option(
     "--model-uid",
     "-u",
@@ -691,6 +698,7 @@ def model_launch(
     endpoint: Optional[str],
     model_name: str,
     model_type: str,
+    model_engine: Optional[str],
     model_uid: str,
     size_in_billions: str,
     model_format: str,
@@ -712,6 +720,9 @@ def model_launch(
         kwargs[ctx.args[i][2:]] = handle_click_args_type(ctx.args[i + 1])
     print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)
 
+    if model_type == "LLM" and model_engine is None:
+        raise ValueError("--model-engine is required for LLM models.")
+
     if n_gpu.lower() == "none":
         _n_gpu: Optional[Union[int, str]] = None
     elif n_gpu == "auto":
@@ -736,11 +747,15 @@ def model_launch(
         else []
     )
 
-    peft_model_config =
-
-
-
-
+    peft_model_config = (
+        {
+            "image_lora_load_kwargs": image_lora_load_params,
+            "image_lora_fuse_kwargs": image_lora_fuse_params,
+            "lora_list": lora_list,
+        }
+        if lora_list or image_lora_load_params or image_lora_fuse_params
+        else None
+    )
 
     _gpu_idx: Optional[List[int]] = (
         None if gpu_idx is None else [int(idx) for idx in gpu_idx.split(",")]
@@ -761,6 +776,7 @@ def model_launch(
     model_uid = client.launch_model(
         model_name=model_name,
         model_type=model_type,
+        model_engine=model_engine,
        model_uid=model_uid,
        model_size_in_billions=model_size,
        model_format=model_format,
@@ -1199,5 +1215,158 @@ def cluster_login(
         f.write(access_token)
 
 
+@cli.command(name="engine", help="Query the applicable inference engine by model name.")
+@click.option(
+    "--model-name",
+    "-n",
+    type=str,
+    required=True,
+    help="The model name you want to query.",
+)
+@click.option(
+    "--model-engine",
+    "-en",
+    type=str,
+    default=None,
+    help="Specify the `model_engine` to query the corresponding combination of other parameters.",
+)
+@click.option(
+    "--model-format",
+    "-f",
+    type=str,
+    default=None,
+    help="Specify the `model_format` to query the corresponding combination of other parameters.",
+)
+@click.option(
+    "--model-size-in-billions",
+    "-s",
+    type=str,
+    default=None,
+    help="Specify the `model_size_in_billions` to query the corresponding combination of other parameters.",
+)
+@click.option(
+    "--quantization",
+    "-q",
+    type=str,
+    default=None,
+    help="Specify the `quantization` to query the corresponding combination of other parameters.",
+)
+@click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def query_engine_by_model_name(
+    model_name: str,
+    model_engine: Optional[str],
+    model_format: Optional[str],
+    model_size_in_billions: Optional[Union[str, int]],
+    quantization: Optional[str],
+    endpoint: Optional[str],
+    api_key: Optional[str],
+):
+    from tabulate import tabulate
+
+    def match_engine_from_spell(value: str, target: Sequence[str]) -> Tuple[bool, str]:
+        """
+        For better usage experience.
+        """
+        for t in target:
+            if value.lower() == t.lower():
+                return True, t
+        return False, value
+
+    def handle_user_passed_parameters() -> List[str]:
+        user_specified_parameters = []
+        if model_engine is not None:
+            user_specified_parameters.append(f"--model-engine {model_engine}")
+        if model_format is not None:
+            user_specified_parameters.append(f"--model-format {model_format}")
+        if model_size_in_billions is not None:
+            user_specified_parameters.append(
+                f"--model-size-in-billions {model_size_in_billions}"
+            )
+        if quantization is not None:
+            user_specified_parameters.append(f"--quantization {quantization}")
+        return user_specified_parameters
+
+    user_specified_params = handle_user_passed_parameters()
+
+    endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
+
+    llm_engines = client.query_engine_by_model_name(model_name)
+    if model_engine is not None:
+        is_matched, model_engine = match_engine_from_spell(
+            model_engine, list(llm_engines.keys())
+        )
+        if not is_matched:
+            print(
+                f'Xinference does not support this inference engine "{model_engine}".',
+                file=sys.stderr,
+            )
+            return
+
+    table = []
+    engines = [model_engine] if model_engine is not None else list(llm_engines.keys())
+    for engine in engines:
+        params = llm_engines[engine]
+        for param in params:
+            if (
+                (model_format is None or model_format == param["model_format"])
+                and (
+                    model_size_in_billions is None
+                    or model_size_in_billions == str(param["model_size_in_billions"])
+                )
+                and (quantization is None or quantization in param["quantizations"])
+            ):
+                if quantization is not None:
+                    table.append(
+                        [
+                            model_name,
+                            engine,
+                            param["model_format"],
+                            param["model_size_in_billions"],
+                            quantization,
+                        ]
+                    )
+                else:
+                    for quant in param["quantizations"]:
+                        table.append(
+                            [
+                                model_name,
+                                engine,
+                                param["model_format"],
+                                param["model_size_in_billions"],
+                                quant,
+                            ]
+                        )
+    if len(table) == 0:
+        print(
+            f"Xinference does not support "
+            f"your provided params: {', '.join(user_specified_params)} for the model {model_name}.",
+            file=sys.stderr,
+        )
+    else:
+        print(
+            tabulate(
+                table,
+                headers=[
+                    "Name",
+                    "Engine",
+                    "Format",
+                    "Size (in billions)",
+                    "Quantization",
+                ],
+            ),
+            file=sys.stderr,
+        )
+
+
 if __name__ == "__main__":
     cli()
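Taken together, these cmdline.py changes make the inference engine an explicit, required choice when launching an LLM, and add an `xinference engine` command (backed by the new `RESTfulClient.query_engine_by_model_name`) for discovering which engine/format/size/quantization combinations a model supports. A minimal sketch of the same flow through the Python client, reusing the values from the test suite that follows and assuming a local endpoint at http://127.0.0.1:9997:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint

# New in 0.11.0: list the engines that can serve a model
# (CLI equivalent: `xinference engine -n orca`).
engines = client.query_engine_by_model_name("orca")
print(list(engines))

# Launching an LLM now requires model_engine; with model_type="LLM" and no
# engine, the CLI raises "--model-engine is required for LLM models."
model_uid = client.launch_model(
    model_name="orca",
    model_type="LLM",
    model_engine="llama.cpp",
    model_size_in_billions=3,
    quantization="q4_0",
)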
xinference/deploy/test/test_cmdline.py
CHANGED

@@ -65,6 +65,7 @@ def test_cmdline(setup, stream, model_uid):
     original_model_uid = model_uid
     model_uid = client.launch_model(
         model_name="orca",
+        model_engine="llama.cpp",
         model_uid=model_uid,
         model_size_in_billions=3,
         quantization="q4_0",
@@ -247,6 +248,7 @@ def test_rotate_logs(setup_with_file_logging):
     replica = 1 if os.name == "nt" else 2
     model_uid = client.launch_model(
         model_name="orca",
+        model_engine="llama.cpp",
         model_uid=None,
         model_size_in_billions=3,
         quantization="q4_0",
xinference/deploy/utils.py
CHANGED
@@ -129,7 +129,7 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bool:
     try:
         from xinference.core.supervisor import SupervisorActor
 
-        supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(
+        supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(  # type: ignore
             address=address, uid=SupervisorActor.uid()
         )
 
xinference/device_utils.py
CHANGED
@@ -17,13 +17,27 @@ import os
 import torch
 from typing_extensions import Literal, Union
 
-DeviceType = Literal["cuda", "mps", "xpu", "cpu"]
+DeviceType = Literal["cuda", "mps", "xpu", "npu", "cpu"]
+DEVICE_TO_ENV_NAME = {
+    "cuda": "CUDA_VISIBLE_DEVICES",
+    "npu": "ASCEND_RT_VISIBLE_DEVICES",
+}
 
 
 def is_xpu_available() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
+def is_npu_available() -> bool:
+    try:
+        import torch
+        import torch_npu  # noqa: F401
+
+        return torch.npu.is_available()
+    except ImportError:
+        return False
+
+
 def get_available_device() -> DeviceType:
     if torch.cuda.is_available():
         return "cuda"
@@ -31,6 +45,8 @@ def get_available_device() -> DeviceType:
         return "mps"
     elif is_xpu_available():
         return "xpu"
+    elif is_npu_available():
+        return "npu"
     return "cpu"
 
 
@@ -41,6 +57,8 @@ def is_device_available(device: str) -> bool:
         return torch.backends.mps.is_available()
     elif device == "xpu":
         return is_xpu_available()
+    elif device == "npu":
+        return is_npu_available()
     elif device == "cpu":
         return True
 
@@ -59,7 +77,7 @@ def move_model_to_available_device(model):
 def get_device_preferred_dtype(device: str) -> Union[torch.dtype, None]:
     if device == "cpu":
         return torch.float32
-    elif device == "cuda" or device == "mps":
+    elif device == "cuda" or device == "mps" or device == "npu":
         return torch.float16
     elif device == "xpu":
         return torch.bfloat16
@@ -68,7 +86,7 @@ def get_device_preferred_dtype(device: str) -> Union[torch.dtype, None]:
 
 
 def is_hf_accelerate_supported(device: str) -> bool:
-    return device == "cuda" or device == "xpu"
+    return device == "cuda" or device == "xpu" or device == "npu"
 
 
 def empty_cache():
@@ -78,6 +96,12 @@ def empty_cache():
         torch.mps.empty_cache()
     if is_xpu_available():
         torch.xpu.empty_cache()
+    if is_npu_available():
+        torch.npu.empty_cache()
+
+
+def get_available_device_env_name():
+    return DEVICE_TO_ENV_NAME.get(get_available_device())
 
 
 def gpu_count():
@@ -94,5 +118,7 @@ def gpu_count():
         return min(torch.cuda.device_count(), len(cuda_visible_devices))
     elif is_xpu_available():
        return torch.xpu.device_count()
+    elif is_npu_available():
+        return torch.npu.device_count()
     else:
         return 0
xinference/fields.py
CHANGED
@@ -32,7 +32,6 @@ logprobs_field = Field(
 max_tokens_field = Field(
     default=1024,
     ge=1,
-    le=32768,
     description="The maximum number of tokens to generate.",
 )
 
@@ -75,6 +74,11 @@ stream_field = Field(
     description="Whether to stream the results as they are generated. Useful for chatbots.",
 )
 
+stream_option_field = Field(
+    default={},
+    description="If set, an additional chunk will be streamed before the `data: [DONE]` message.",
+)
+
 top_k_field = Field(
     default=40,
     ge=0,
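stream_option_field backs an OpenAI-style streaming option on the completion endpoints: when supplied, one extra chunk is emitted just before `data: [DONE]`. A hedged sketch against the OpenAI-compatible REST API; the request key (`stream_options`) and its `include_usage` flag are assumed from OpenAI's convention rather than shown in this diff:

import requests

resp = requests.post(
    "http://127.0.0.1:9997/v1/chat/completions",    # assumed local endpoint
    json={
        "model": "my-llm-uid",                      # hypothetical model uid
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
        "stream_options": {"include_usage": True},  # assumed field name, per OpenAI
    },
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(line.decode())  # the final chunk before "data: [DONE]" carries the extra data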
xinference/model/audio/model_spec.json
CHANGED

@@ -75,5 +75,12 @@
     "model_id": "BELLE-2/Belle-whisper-large-v2-zh",
     "model_revision": "ec5bd5d78598545b7585814edde86dac2002b5b9",
     "multilingual": false
+  },
+  {
+    "model_name": "Belle-whisper-large-v3-zh",
+    "model_family": "whisper",
+    "model_id": "BELLE-2/Belle-whisper-large-v3-zh",
+    "model_revision": "3bebc7247696b39f5ab9ed22db426943ac33f600",
+    "multilingual": false
   }
-]
+]
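This registers Belle-whisper-large-v3-zh alongside the existing v2 entry. A sketch of launching it through the Python client; the audio handle's `transcriptions` call is assumed to accept raw bytes as for the builtin whisper models:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint

model_uid = client.launch_model(
    model_name="Belle-whisper-large-v3-zh",
    model_type="audio",
)
model = client.get_model(model_uid)

# The spec marks the model as non-multilingual (Chinese-only), so no language hint is needed.
with open("sample.wav", "rb") as f:
    print(model.transcriptions(f.read())["text"])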
xinference/model/audio/whisper.py
CHANGED

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-from typing import TYPE_CHECKING, Dict, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from xinference.device_utils import (
     get_available_device,
@@ -81,12 +81,87 @@ class WhisperModel:
         audio: bytes,
         generate_kwargs: Dict,
         response_format: str,
+        temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
+        if temperature != 0:
+            generate_kwargs.update({"temperature": temperature, "do_sample": True})
+
         if response_format == "json":
             logger.debug("Call whisper model with generate_kwargs: %s", generate_kwargs)
             assert callable(self._model)
             result = self._model(audio, generate_kwargs=generate_kwargs)
             return {"text": result["text"]}
+        elif response_format == "verbose_json":
+            return_timestamps: Union[bool, str] = False
+            if not timestamp_granularities:
+                return_timestamps = True
+            elif timestamp_granularities == ["segment"]:
+                return_timestamps = True
+            elif timestamp_granularities == ["word"]:
+                return_timestamps = "word"
+            else:
+                raise Exception(
+                    f"Unsupported timestamp_granularities: {timestamp_granularities}"
+                )
+            assert callable(self._model)
+            results = self._model(
+                audio,
+                generate_kwargs=generate_kwargs,
+                return_timestamps=return_timestamps,
+            )
+
+            language = generate_kwargs.get("language", "english")
+
+            if return_timestamps is True:
+                segments: List[dict] = []
+
+                def _get_chunk_segment_json(idx, text, start, end):
+                    find_start = 0
+                    if segments:
+                        find_start = segments[-1]["seek"] + len(segments[-1]["text"])
+                    return {
+                        "id": idx,
+                        "seek": results["text"].find(text, find_start),
+                        "start": start,
+                        "end": end,
+                        "text": text,
+                        "tokens": [],
+                        "temperature": temperature,
+                        # We can't provide these values.
+                        "avg_logprob": 0.0,
+                        "compression_ratio": 0.0,
+                        "no_speech_prob": 0.0,
+                    }
+
+                for idx, c in enumerate(results.get("chunks", [])):
+                    text = c["text"]
+                    start, end = c["timestamp"]
+                    segments.append(_get_chunk_segment_json(idx, text, start, end))
+
+                return {
+                    "task": "transcribe",
+                    "language": language,
+                    "duration": segments[-1]["end"] if segments else 0,
+                    "text": results["text"],
+                    "segments": segments,
+                }
+            else:
+                assert return_timestamps == "word"
+
+                words = []
+                for idx, c in enumerate(results.get("chunks", [])):
+                    text = c["text"]
+                    start, end = c["timestamp"]
+                    words.append({"word": text, "start": start, "end": end})
+
+                return {
+                    "task": "transcribe",
+                    "language": language,
+                    "duration": words[-1]["end"] if words else 0,
+                    "text": results["text"],
+                    "words": words,
+                }
         else:
             raise ValueError(f"Unsupported response format: {response_format}")
 
@@ -97,12 +172,8 @@ class WhisperModel:
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
-        if temperature != 0:
-            logger.warning(
-                "Temperature for whisper transcriptions will be ignored: %s.",
-                temperature,
-            )
         if prompt is not None:
             logger.warning(
                 "Prompt for whisper transcriptions will be ignored: %s", prompt
@@ -115,30 +186,35 @@ class WhisperModel:
                 else {"task": "transcribe"}
             ),
             response_format=response_format,
+            temperature=temperature,
+            timestamp_granularities=timestamp_granularities,
         )
 
     def translations(
         self,
         audio: bytes,
+        language: Optional[str] = None,
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         if not self._model_spec.multilingual:
             raise RuntimeError(
                 f"Model {self._model_spec.model_name} is not suitable for translations."
             )
-        if temperature != 0:
-            logger.warning(
-                "Temperature for whisper transcriptions will be ignored: %s.",
-                temperature,
-            )
         if prompt is not None:
             logger.warning(
                 "Prompt for whisper transcriptions will be ignored: %s", prompt
             )
         return self._call_model(
             audio=audio,
-            generate_kwargs=
+            generate_kwargs=(
+                {"language": language, "task": "translate"}
+                if language is not None
+                else {"task": "translate"}
+            ),
             response_format=response_format,
+            temperature=temperature,
+            timestamp_granularities=timestamp_granularities,
         )
xinference/model/core.py
CHANGED
@@ -50,11 +50,11 @@ def create_model_instance(
     model_uid: str,
     model_type: str,
     model_name: str,
+    model_engine: Optional[str],
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    is_local_deployment: bool = False,
     **kwargs,
 ) -> Tuple[Any, ModelDescription]:
     from .audio.core import create_audio_model_instance
@@ -69,11 +69,11 @@ def create_model_instance(
             devices,
             model_uid,
             model_name,
+            model_engine,
             model_format,
             model_size_in_billions,
             quantization,
             peft_model_config,
-            is_local_deployment,
             **kwargs,
         )
     elif model_type == "embedding":
xinference/model/embedding/core.py
CHANGED

@@ -12,12 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import logging
+import os
 from collections import defaultdict
 from typing import Dict, List, Optional, Tuple, Union, no_type_check
 
 import numpy as np
 
+from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import get_cache_dir, is_model_cached
@@ -28,6 +31,10 @@ logger = logging.getLogger(__name__)
 # Init when registering all the builtin models.
 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 EMBEDDING_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+EMBEDDING_EMPTY_CACHE_COUNT = int(
+    os.getenv("XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT", "10")
+)
+assert EMBEDDING_EMPTY_CACHE_COUNT > 0
 
 
 def get_embedding_model_descriptions():
@@ -116,6 +123,7 @@ class EmbeddingModel:
         self._model_path = model_path
         self._device = device
         self._model = None
+        self._counter = 0
 
     def load(self):
         try:
@@ -134,6 +142,11 @@ class EmbeddingModel:
         self._model = SentenceTransformer(self._model_path, device=self._device)
 
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+        self._counter += 1
+        if self._counter % EMBEDDING_EMPTY_CACHE_COUNT == 0:
+            logger.debug("Empty embedding cache.")
+            gc.collect()
+            empty_cache()
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
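The embedding model now releases accelerator memory every N create_embedding calls, where N comes from XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT (default 10, must be positive). A minimal sketch of tuning it; the variable has to be set before xinference.model.embedding.core is imported, since it is read once at import time:

import os

# Flush caches (gc.collect() + device empty_cache()) every 50 embedding calls instead of 10.
os.environ["XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT"] = "50"

# ...then start the xinference worker/supervisor in this environment.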
xinference/model/image/__init__.py
CHANGED

@@ -20,12 +20,19 @@ from itertools import chain
 from .core import (
     BUILTIN_IMAGE_MODELS,
     IMAGE_MODEL_DESCRIPTIONS,
+    MODEL_NAME_TO_REVISION,
     MODELSCOPE_IMAGE_MODELS,
     ImageModelFamilyV1,
     generate_image_description,
     get_cache_status,
     get_image_model_descriptions,
 )
+from .custom import (
+    CustomImageModelFamilyV1,
+    get_user_defined_images,
+    register_image,
+    unregister_image,
+)
 
 _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
 _model_spec_modelscope_json = os.path.join(
@@ -37,6 +44,9 @@ BUILTIN_IMAGE_MODELS.update(
         for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
     )
 )
+for model_name, model_spec in BUILTIN_IMAGE_MODELS.items():
+    MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
 MODELSCOPE_IMAGE_MODELS.update(
     dict(
         (spec["model_name"], ImageModelFamilyV1(**spec))
@@ -45,6 +55,8 @@ MODELSCOPE_IMAGE_MODELS.update(
         )
     )
 )
+for model_name, model_spec in MODELSCOPE_IMAGE_MODELS.items():
+    MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
 
 # register model description
 for model_name, model_spec in chain(
@@ -52,4 +64,21 @@ for model_name, model_spec in chain(
 ):
     IMAGE_MODEL_DESCRIPTIONS.update(generate_image_description(model_spec))
 
+from ...constants import XINFERENCE_MODEL_DIR
+
+user_defined_image_dir = os.path.join(XINFERENCE_MODEL_DIR, "image")
+if os.path.isdir(user_defined_image_dir):
+    for f in os.listdir(user_defined_image_dir):
+        with codecs.open(
+            os.path.join(user_defined_image_dir, f), encoding="utf-8"
+        ) as fd:
+            user_defined_image_family = CustomImageModelFamilyV1.parse_obj(
+                json.load(fd)
+            )
+            register_image(user_defined_image_family, persist=False)
+
+for ud_image in get_user_defined_images():
+    IMAGE_MODEL_DESCRIPTIONS.update(generate_image_description(ud_image))
+
 del _model_spec_json
+del _model_spec_modelscope_json
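Image models become user-registerable: every JSON file under XINFERENCE_MODEL_DIR/image is parsed into a CustomImageModelFamilyV1 and registered at import time, and register_image/unregister_image are exposed for programmatic use. A rough sketch of registering one from Python; the spec field names below are assumptions modeled on the builtin image spec, since custom.py itself is not excerpted here:

from xinference.model.image import CustomImageModelFamilyV1, register_image

# Hypothetical custom Stable Diffusion finetune; field names are assumed, check
# xinference/model/image/custom.py for the authoritative schema.
my_image_model = CustomImageModelFamilyV1(
    model_name="my-sd-finetune",
    model_family="stable_diffusion",
    model_uri="/path/to/my-sd-finetune",
)
register_image(my_image_model, persist=True)  # the import-time scan uses persist=False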
xinference/model/image/core.py
CHANGED
@@ -27,6 +27,7 @@ MAX_ATTEMPTS = 3
 
 logger = logging.getLogger(__name__)
 
+MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 IMAGE_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
 BUILTIN_IMAGE_MODELS: Dict[str, "ImageModelFamilyV1"] = {}
 MODELSCOPE_IMAGE_MODELS: Dict[str, "ImageModelFamilyV1"] = {}
@@ -119,6 +120,11 @@ def generate_image_description(
 def match_diffusion(model_name: str) -> ImageModelFamilyV1:
     from ..utils import download_from_modelscope
     from . import BUILTIN_IMAGE_MODELS, MODELSCOPE_IMAGE_MODELS
+    from .custom import get_user_defined_images
+
+    for model_spec in get_user_defined_images():
+        if model_spec.model_name == model_name:
+            return model_spec
 
     if download_from_modelscope():
         if model_name in MODELSCOPE_IMAGE_MODELS: