xinference 0.12.0__py3-none-any.whl → 0.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +108 -14
- xinference/client/restful/restful_client.py +78 -5
- xinference/constants.py +1 -0
- xinference/core/cache_tracker.py +48 -28
- xinference/core/event.py +5 -6
- xinference/core/model.py +59 -42
- xinference/core/scheduler.py +46 -18
- xinference/core/supervisor.py +73 -24
- xinference/core/worker.py +68 -2
- xinference/deploy/cmdline.py +86 -2
- xinference/deploy/test/test_cmdline.py +19 -10
- xinference/model/audio/__init__.py +14 -1
- xinference/model/audio/core.py +12 -1
- xinference/model/audio/custom.py +6 -4
- xinference/model/audio/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +34 -2
- xinference/model/llm/llm_family.json +8 -2
- xinference/model/llm/llm_family.py +86 -1
- xinference/model/llm/llm_family_csghub.json +66 -0
- xinference/model/llm/llm_family_modelscope.json +8 -2
- xinference/model/llm/pytorch/chatglm.py +41 -12
- xinference/model/llm/pytorch/core.py +128 -88
- xinference/model/llm/pytorch/glm4v.py +24 -3
- xinference/model/llm/pytorch/internlm2.py +15 -0
- xinference/model/llm/pytorch/qwen_vl.py +1 -1
- xinference/model/llm/pytorch/utils.py +69 -189
- xinference/model/llm/utils.py +27 -14
- xinference/model/llm/vllm/core.py +10 -4
- xinference/model/rerank/core.py +35 -6
- xinference/model/utils.py +8 -2
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +125 -0
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
- xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
- xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
- xinference/types.py +28 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.4bafd904.css +2 -0
- xinference/web/ui/build/static/css/main.4bafd904.css.map +1 -0
- xinference/web/ui/build/static/js/main.b80d9c08.js +3 -0
- xinference/web/ui/build/static/js/main.b80d9c08.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/16537795de12c61903b6110c241f62a7855b2d0fc1e7c3d1faa347267f3a6893.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/17b8f071491402d70b146532358b1a612226e5dc7b3e8755a1322d27b4680cee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/395409bd005e19d48b437c48d88e5126c7865ba9631fe98535333c952e383dc5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/43991bb67c3136863e6fb37f796466b12eb547a1465408cc77820fddafb3bed3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/{15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json → 935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json} +1 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a7109d4425e3d94ca2726fc7020fd33bf5030afd4c9cf4bf71e21776cd70646a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/METADATA +1 -1
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/RECORD +69 -56
- xinference/web/ui/build/static/css/main.54bca460.css +0 -2
- xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
- xinference/web/ui/build/static/js/main.551aa479.js +0 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +0 -1
- /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.b80d9c08.js.LICENSE.txt} +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/LICENSE +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/WHEEL +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/top_level.txt +0 -0
xinference/core/scheduler.py
CHANGED
@@ -15,9 +15,10 @@
 import asyncio
 import functools
 import logging
+import uuid
 from collections import deque
 from enum import Enum
-from typing import List, Optional, Set
+from typing import List, Optional, Set, Tuple
 
 import xoscar as xo
 
@@ -50,9 +51,10 @@ class InferenceRequest:
         self._new_tokens = []
         # kv_cache used in decode phase
         self._kv_cache = None
-        # use passed args from
+        # use passed args from upstream interface
         self._inference_args = args
-        # use passed kwargs from
+        # use passed kwargs from upstream interface, currently for getting raw generate config from upstream,
+        # which is useful for some special models
         self._inference_kwargs = kwargs
         # should this request be stopped
         self._stopped = False
@@ -63,6 +65,10 @@ class InferenceRequest:
         self._aborted = False
         # sanitized generate config
         self._sanitized_generate_config = None
+        # Chunk id for results. In stream mode, all the chunk ids should be same.
+        self._stream_chunk_id = str(uuid.uuid4())
+        # For calculate attention mask if needed
+        self.padding_len = 0
         # Use in stream mode
         self.last_output_length = 0
         # inference results,
@@ -81,19 +87,26 @@ class InferenceRequest:
         self._check_args()
 
     def _check_args(self):
-
-
-
-            self._inference_args[0]
-
-
-
-            self._inference_args[1]
-
-
-
-            self._inference_args[2]
-
+        # chat
+        if len(self._inference_args) == 3:
+            # system prompt
+            assert self._inference_args[0] is None or isinstance(
+                self._inference_args[0], str
+            )
+            # chat history
+            assert self._inference_args[1] is None or isinstance(
+                self._inference_args[1], list
+            )
+            # generate config
+            assert self._inference_args[2] is None or isinstance(
+                self._inference_args[2], dict
+            )
+        else:  # generate
+            assert len(self._inference_args) == 1
+            # generate config
+            assert self._inference_args[0] is None or isinstance(
+                self._inference_args[0], dict
+            )
 
     @property
     def prompt(self):
@@ -148,7 +161,11 @@ class InferenceRequest:
 
     @property
     def generate_config(self):
-        return
+        return (
+            self._inference_args[2]
+            if len(self._inference_args) == 3
+            else self._inference_args[0]
+        )
 
     @property
     def sanitized_generate_config(self):
@@ -158,6 +175,10 @@ class InferenceRequest:
     def sanitized_generate_config(self, value: dict):
         self._sanitized_generate_config = value
 
+    @property
+    def inference_kwargs(self):
+        return self._inference_kwargs
+
     @property
     def stopped(self):
         return self._stopped
@@ -174,6 +195,10 @@ class InferenceRequest:
     def finish_reason(self, value: Optional[str]):
         self._finish_reason = value
 
+    @property
+    def chunk_id(self):
+        return self._stream_chunk_id
+
     @property
     def stream(self) -> bool:
         return (
@@ -213,7 +238,9 @@ class InferenceRequest:
         )
 
     @functools.lru_cache
-    def get_generate_configs(
+    def get_generate_configs(
+        self, eos_token_id: int, builtin_stop_token_ids: Optional[Tuple[int]] = None
+    ):
         from ..types import max_tokens_field
 
         max_new_tokens = int(
@@ -227,6 +254,7 @@ class InferenceRequest:
         )
         stop_token_ids = set(stop_token_ids)
         stop_token_ids.add(eos_token_id)
+        stop_token_ids.update(builtin_stop_token_ids or [])
         temperature = float(self.sanitized_generate_config.get("temperature", 1.0))
         repetition_penalty = float(
             self.sanitized_generate_config.get("repetition_penalty", 1.0)
xinference/core/supervisor.py
CHANGED
@@ -982,32 +982,31 @@ class SupervisorActor(xo.StatelessActor):
         )
 
     @log_async(logger=logger)
-    async def list_cached_models(
+    async def list_cached_models(
+        self, model_name: Optional[str] = None, worker_ip: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        target_ip_worker_ref = (
+            self._get_worker_ref_by_ip(worker_ip) if worker_ip is not None else None
+        )
+        if (
+            worker_ip is not None
+            and not self.is_local_deployment()
+            and target_ip_worker_ref is None
+        ):
+            raise ValueError(f"Worker ip address {worker_ip} is not in the cluster.")
+
+        # search assigned worker and return
+        if target_ip_worker_ref:
+            cached_models = await target_ip_worker_ref.list_cached_models(model_name)
+            cached_models = sorted(cached_models, key=lambda x: x["model_name"])
+            return cached_models
+
+        # search all worker
         cached_models = []
         for worker in self._worker_address_to_worker.values():
-
-
-
-            model_format = model_version.get("model_format", None)
-            model_size_in_billions = model_version.get(
-                "model_size_in_billions", None
-            )
-            quantizations = model_version.get("quantization", None)
-            actor_ip_address = model_version.get("actor_ip_address", None)
-            path = model_version.get("path", None)
-            real_path = model_version.get("real_path", None)
-
-            cache_entry = {
-                "model_name": model_name,
-                "model_format": model_format,
-                "model_size_in_billions": model_size_in_billions,
-                "quantizations": quantizations,
-                "path": path,
-                "Actor IP Address": actor_ip_address,
-                "real_path": real_path,
-            }
-
-            cached_models.append(cache_entry)
+            res = await worker.list_cached_models(model_name)
+            cached_models.extend(res)
+        cached_models = sorted(cached_models, key=lambda x: x["model_name"])
         return cached_models
 
     @log_async(logger=logger)
@@ -1083,6 +1082,56 @@ class SupervisorActor(xo.StatelessActor):
         worker_status.update_time = time.time()
         worker_status.status = status
 
+    async def list_deletable_models(
+        self, model_version: str, worker_ip: Optional[str] = None
+    ) -> List[str]:
+        target_ip_worker_ref = (
+            self._get_worker_ref_by_ip(worker_ip) if worker_ip is not None else None
+        )
+        if (
+            worker_ip is not None
+            and not self.is_local_deployment()
+            and target_ip_worker_ref is None
+        ):
+            raise ValueError(f"Worker ip address {worker_ip} is not in the cluster.")
+
+        ret = []
+        if target_ip_worker_ref:
+            ret = await target_ip_worker_ref.list_deletable_models(
+                model_version=model_version,
+            )
+            return ret
+
+        for worker in self._worker_address_to_worker.values():
+            path = await worker.list_deletable_models(model_version=model_version)
+            ret.extend(path)
+        return ret
+
+    async def confirm_and_remove_model(
+        self, model_version: str, worker_ip: Optional[str] = None
+    ) -> bool:
+        target_ip_worker_ref = (
+            self._get_worker_ref_by_ip(worker_ip) if worker_ip is not None else None
+        )
+        if (
+            worker_ip is not None
+            and not self.is_local_deployment()
+            and target_ip_worker_ref is None
+        ):
+            raise ValueError(f"Worker ip address {worker_ip} is not in the cluster.")
+
+        if target_ip_worker_ref:
+            ret = await target_ip_worker_ref.confirm_and_remove_model(
+                model_version=model_version,
+            )
+            return ret
+        ret = True
+        for worker in self._worker_address_to_worker.values():
+            ret = ret and await worker.confirm_and_remove_model(
+                model_version=model_version,
+            )
+        return ret
+
     @staticmethod
     def record_metrics(name, op, kwargs):
         record_metrics(name, op, kwargs)
xinference/core/worker.py
CHANGED
@@ -16,6 +16,7 @@ import asyncio
 import os
 import platform
 import queue
+import shutil
 import signal
 import threading
 import time
@@ -786,8 +787,73 @@ class WorkerActor(xo.StatelessActor):
             except asyncio.CancelledError:  # pragma: no cover
                 break
 
-    async def list_cached_models(
-
+    async def list_cached_models(
+        self, model_name: Optional[str] = None
+    ) -> List[Dict[Any, Any]]:
+        lists = await self._cache_tracker_ref.list_cached_models(
+            self.address, model_name
+        )
+        cached_models = []
+        for list in lists:
+            cached_model = {
+                "model_name": list.get("model_name"),
+                "model_size_in_billions": list.get("model_size_in_billions"),
+                "model_format": list.get("model_format"),
+                "quantization": list.get("quantization"),
+                "model_version": list.get("model_version"),
+            }
+            path = list.get("model_file_location")
+            cached_model["path"] = path
+            # parsing soft links
+            if os.path.isdir(path):
+                files = os.listdir(path)
+                # dir has files
+                if files:
+                    resolved_file = os.path.realpath(os.path.join(path, files[0]))
+                    if resolved_file:
+                        cached_model["real_path"] = os.path.dirname(resolved_file)
+            else:
+                cached_model["real_path"] = os.path.realpath(path)
+            cached_model["actor_ip_address"] = self.address
+            cached_models.append(cached_model)
+        return cached_models
+
+    async def list_deletable_models(self, model_version: str) -> List[str]:
+        paths = set()
+        path = await self._cache_tracker_ref.list_deletable_models(
+            model_version, self.address
+        )
+        if os.path.isfile(path):
+            path = os.path.dirname(path)
+
+        if os.path.isdir(path):
+            files = os.listdir(path)
+            paths.update([os.path.join(path, file) for file in files])
+            # search real path
+            if paths:
+                paths.update([os.path.realpath(path) for path in paths])
+
+        return list(paths)
+
+    async def confirm_and_remove_model(self, model_version: str) -> bool:
+        paths = await self.list_deletable_models(model_version)
+        for path in paths:
+            try:
+                if os.path.islink(path):
+                    os.unlink(path)
+                elif os.path.isfile(path):
+                    os.remove(path)
+                elif os.path.isdir(path):
+                    shutil.rmtree(path)
+                else:
+                    logger.debug(f"{path} is not a valid path.")
+            except Exception as e:
+                logger.error(f"Fail to delete {path} with error:{e}.")
+                return False
+        await self._cache_tracker_ref.confirm_and_remove_model(
+            model_version, self.address
+        )
+        return True
 
     @staticmethod
     def record_metrics(name, op, kwargs):
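For reference, each entry returned by the new worker-side list_cached_models above is a flat dictionary whose keys are fixed by the code in the diff. A minimal illustrative sketch of one entry follows; only the key names come from the diff, every value (including the version string and the paths) is a made-up placeholder:

    # Hypothetical example of a single entry returned by WorkerActor.list_cached_models();
    # only the keys mirror the diff above, all values below are placeholders.
    cached_model_entry = {
        "model_name": "qwen2-instruct",
        "model_size_in_billions": 7,
        "model_format": "pytorch",
        "quantization": "none",
        "model_version": "qwen2-instruct--7B--pytorch--none",  # format assumed
        "path": "/path/to/xinference/cache/qwen2-instruct",    # may be a dir of symlinks
        "real_path": "/path/to/actual/model/files",            # resolved symlink target
        "actor_ip_address": "192.168.0.10",
    }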
xinference/deploy/cmdline.py
CHANGED
@@ -577,6 +577,18 @@ def list_model_registrations(
     type=str,
     help="Xinference endpoint.",
 )
+@click.option(
+    "--model_name",
+    "-n",
+    type=str,
+    help="Provide the name of the models to be removed.",
+)
+@click.option(
+    "--worker-ip",
+    default=None,
+    type=str,
+    help="Specify which worker this model runs on by ip, for distributed situation.",
+)
 @click.option(
     "--api-key",
     "-ak",
@@ -587,6 +599,8 @@ def list_model_registrations(
 def list_cached_models(
     endpoint: Optional[str],
     api_key: Optional[str],
+    model_name: Optional[str],
+    worker_ip: Optional[str],
 ):
     from tabulate import tabulate
 
@@ -595,10 +609,13 @@ def list_cached_models(
     if api_key is None:
         client._set_token(get_stored_token(endpoint, client))
 
-    cached_models = client.list_cached_models()
+    cached_models = client.list_cached_models(model_name, worker_ip)
+    if not cached_models:
+        print("There are no cache files.")
+        return
+    headers = list(cached_models[0].keys())
 
     print("cached_model: ")
-    headers = list(cached_models[0].keys())
     table_data = []
     for model in cached_models:
         row_data = [
@@ -608,6 +625,73 @@ def list_cached_models(
     print(tabulate(table_data, headers=headers, tablefmt="pretty"))
 
 
+@cli.command("remove-cache", help="Remove selected cached models in Xinference.")
+@click.option(
+    "--endpoint",
+    "-e",
+    type=str,
+    help="Xinference endpoint.",
+)
+@click.option(
+    "--model_version",
+    "-n",
+    type=str,
+    help="Provide the version of the models to be removed.",
+)
+@click.option(
+    "--worker-ip",
+    default=None,
+    type=str,
+    help="Specify which worker this model runs on by ip, for distributed situation.",
+)
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+@click.option("--check", is_flag=True, help="Confirm the deletion of the cache.")
+def remove_cache(
+    endpoint: Optional[str],
+    model_version: str,
+    api_key: Optional[str],
+    check: bool,
+    worker_ip: Optional[str] = None,
+):
+    endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
+
+    if not check:
+        response = client.list_deletable_models(
+            model_version=model_version, worker_ip=worker_ip
+        )
+        paths: List[str] = response.get("paths", [])
+        if not paths:
+            click.echo(f"There is no model version named {model_version}.")
+            return
+        click.echo(f"Model {model_version} cache directory to be deleted:")
+        for path in response.get("paths", []):
+            click.echo(f"{path}")
+
+        if click.confirm("Do you want to proceed with the deletion?", abort=True):
+            check = True
+    try:
+        result = client.confirm_and_remove_model(
+            model_version=model_version, worker_ip=worker_ip
+        )
+        if result:
+            click.echo(f"Cache directory {model_version} has been deleted.")
+        else:
+            click.echo(
+                f"Cache directory {model_version} fail to be deleted. Please check the log."
+            )
+    except Exception as e:
+        click.echo(f"An error occurred while deleting the cache: {e}")
+
+
 @cli.command(
     "launch",
     help="Launch a model with the Xinference framework with the given parameters.",
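The new remove-cache command and the extended list-cached-models options above map directly onto the RESTful client calls visible in this diff (list_cached_models, list_deletable_models, confirm_and_remove_model). Below is a minimal sketch of driving the same calls from Python; the import path, endpoint URL, and model version string are assumptions for illustration, while the method names and arguments mirror the CLI code above:

    # Sketch only: the endpoint, model version, and import path are assumed, not taken from the diff.
    from xinference.client import RESTfulClient

    client = RESTfulClient(base_url="http://127.0.0.1:9997")

    # Optional filters, passed positionally just as the CLI does above.
    model_name = None
    worker_ip = None
    for entry in client.list_cached_models(model_name, worker_ip):
        print(entry["model_name"], entry["path"])

    # Preview which cache files would be removed for a model version, then remove them.
    deletable = client.list_deletable_models(
        model_version="my-model--7B--pytorch--none", worker_ip=worker_ip
    )
    print(deletable.get("paths", []))
    client.confirm_and_remove_model(
        model_version="my-model--7B--pytorch--none", worker_ip=worker_ip
    )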
xinference/deploy/test/test_cmdline.py
CHANGED
@@ -26,6 +26,7 @@ from ..cmdline import (
     model_list,
     model_terminate,
     register_model,
+    remove_cache,
     unregister_model,
 )
 
@@ -287,18 +288,26 @@ def test_list_cached_models(setup):
 
     result = runner.invoke(
         list_cached_models,
-        [
-            "--endpoint",
-            endpoint,
-        ],
+        ["--endpoint", endpoint, "--model_name", "orca"],
     )
-    assert result.exit_code == 0
-    assert "cached_model: " in result.stdout
-
-    # check if the output is in tabular format
     assert "model_name" in result.stdout
     assert "model_format" in result.stdout
     assert "model_size_in_billions" in result.stdout
-    assert "
+    assert "quantization" in result.stdout
+    assert "model_version" in result.stdout
     assert "path" in result.stdout
-    assert "
+    assert "actor_ip_address" in result.stdout
+
+
+def test_remove_cache(setup):
+    endpoint, _ = setup
+    runner = CliRunner()
+
+    result = runner.invoke(
+        remove_cache,
+        ["--endpoint", endpoint, "--model_version", "orca"],
+        input="y\n",
+    )
+
+    assert result.exit_code == 0
+    assert "Cache directory orca has been deleted."
xinference/model/audio/__init__.py
CHANGED
@@ -32,6 +32,9 @@ from .custom import (
 )
 
 _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
+_model_spec_modelscope_json = os.path.join(
+    os.path.dirname(__file__), "model_spec_modelscope.json"
+)
 BUILTIN_AUDIO_MODELS = dict(
     (spec["model_name"], AudioModelFamilyV1(**spec))
     for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
@@ -39,8 +42,17 @@ BUILTIN_AUDIO_MODELS = dict(
 for model_name, model_spec in BUILTIN_AUDIO_MODELS.items():
     MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
 
+MODELSCOPE_AUDIO_MODELS = dict(
+    (spec["model_name"], AudioModelFamilyV1(**spec))
+    for spec in json.load(
+        codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
+    )
+)
+for model_name, model_spec in MODELSCOPE_AUDIO_MODELS.items():
+    MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
 # register model description after recording model revision
-for model_spec_info in [BUILTIN_AUDIO_MODELS]:
+for model_spec_info in [BUILTIN_AUDIO_MODELS, MODELSCOPE_AUDIO_MODELS]:
     for model_name, model_spec in model_spec_info.items():
         if model_spec.model_name not in AUDIO_MODEL_DESCRIPTIONS:
             AUDIO_MODEL_DESCRIPTIONS.update(generate_audio_description(model_spec))
@@ -64,3 +76,4 @@ for ud_audio in get_user_defined_audios():
     AUDIO_MODEL_DESCRIPTIONS.update(generate_audio_description(ud_audio))
 
 del _model_spec_json
+del _model_spec_modelscope_json
xinference/model/audio/core.py
CHANGED
@@ -95,13 +95,24 @@ def generate_audio_description(
 
 
 def match_audio(model_name: str) -> AudioModelFamilyV1:
-    from
+    from ..utils import download_from_modelscope
+    from . import BUILTIN_AUDIO_MODELS, MODELSCOPE_AUDIO_MODELS
     from .custom import get_user_defined_audios
 
     for model_spec in get_user_defined_audios():
         if model_spec.model_name == model_name:
             return model_spec
 
+    if download_from_modelscope():
+        if model_name in MODELSCOPE_AUDIO_MODELS:
+            logger.debug(f"Audio model {model_name} found in ModelScope.")
+            return MODELSCOPE_AUDIO_MODELS[model_name]
+        else:
+            logger.debug(
+                f"Audio model {model_name} not found in ModelScope, "
+                f"now try to load it via builtin way."
+            )
+
     if model_name in BUILTIN_AUDIO_MODELS:
         return BUILTIN_AUDIO_MODELS[model_name]
     else:
xinference/model/audio/custom.py
CHANGED
@@ -83,15 +83,17 @@ def get_user_defined_audios() -> List[CustomAudioModelFamilyV1]:
 def register_audio(model_spec: CustomAudioModelFamilyV1, persist: bool):
     from ...constants import XINFERENCE_MODEL_DIR
     from ..utils import is_valid_model_name, is_valid_model_uri
-    from . import BUILTIN_AUDIO_MODELS
+    from . import BUILTIN_AUDIO_MODELS, MODELSCOPE_AUDIO_MODELS
 
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")
 
     with UD_AUDIO_LOCK:
-        for model_name in
-
-
+        for model_name in (
+            list(BUILTIN_AUDIO_MODELS.keys())
+            + list(MODELSCOPE_AUDIO_MODELS.keys())
+            + [spec.model_name for spec in UD_AUDIOS]
+        ):
             if model_spec.model_name == model_name:
                 raise ValueError(
                     f"Model name conflicts with existing model {model_spec.model_name}"
xinference/model/audio/model_spec_modelscope.json
ADDED
@@ -0,0 +1,20 @@
+[
+  {
+    "model_name": "whisper-large-v3",
+    "model_family": "whisper",
+    "model_hub": "modelscope",
+    "model_id": "AI-ModelScope/whisper-large-v3",
+    "model_revision": "master",
+    "ability": "audio-to-text",
+    "multilingual": true
+  },
+  {
+    "model_name": "ChatTTS",
+    "model_family": "ChatTTS",
+    "model_hub": "modelscope",
+    "model_id": "pzc163/chatTTS",
+    "model_revision": "master",
+    "ability": "text-to-audio",
+    "multilingual": true
+  }
+]
xinference/model/llm/__init__.py
CHANGED
@@ -25,6 +25,7 @@ from .core import (
     get_llm_model_descriptions,
 )
 from .llm_family import (
+    BUILTIN_CSGHUB_LLM_FAMILIES,
     BUILTIN_LLM_FAMILIES,
     BUILTIN_LLM_MODEL_CHAT_FAMILIES,
     BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
@@ -221,13 +222,44 @@ def _install():
         if "tools" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
 
-
+    csghub_json_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "llm_family_csghub.json"
+    )
+    for json_obj in json.load(codecs.open(csghub_json_path, "r", encoding="utf-8")):
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        BUILTIN_CSGHUB_LLM_FAMILIES.append(model_spec)
+
+        # register prompt style, in case that we have something missed
+        # if duplicated with huggingface json, keep it as the huggingface style
+        if (
+            "chat" in model_spec.model_ability
+            and isinstance(model_spec.prompt_style, PromptStyleV1)
+            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
+        ):
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tools" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
+
+    for llm_specs in [
+        BUILTIN_LLM_FAMILIES,
+        BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_CSGHUB_LLM_FAMILIES,
+    ]:
         for llm_spec in llm_specs:
             if llm_spec.model_name not in LLM_MODEL_DESCRIPTIONS:
                 LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(llm_spec))
 
     # traverse all families and add engine parameters corresponding to the model name
-    for families in [
+    for families in [
+        BUILTIN_LLM_FAMILIES,
+        BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_CSGHUB_LLM_FAMILIES,
+    ]:
         for family in families:
             generate_engine_config_by_model_family(family)
 
xinference/model/llm/llm_family.json
CHANGED
@@ -939,6 +939,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 9,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
@@ -2288,7 +2290,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
     "model_specs": [
@@ -2593,7 +2596,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
    ],
     "model_description": "Qwen2 is the new series of Qwen large language models. ",
     "model_specs": [
@@ -5673,9 +5677,11 @@
       ],
       "intra_message_sep": "<|im_end|>",
       "stop_token_ids": [
+        2,
         92542
       ],
       "stop": [
+        "</s>",
         "<|im_end|>"
       ]
     }