PyPI - xinference - Versions diffs - 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl - Mend

xinference 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (95)

xinference/core/utils.py CHANGED Viewed

@@ -11,26 +11,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import logging
 import os
 import random
 import string
-from typing import Generator, Tuple
+from typing import Dict, Generator, List, Tuple, Union
 import orjson
 from pydantic import BaseModel
+from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
 logger = logging.getLogger(__name__)
-def log_async(logger):
+def log_async(logger, args_formatter=None):
     import time
     from functools import wraps
     def decorator(func):
         @wraps(func)
         async def wrapped(*args, **kwargs):
-            logger.debug(f"Enter {func.__name__}, args: {args}, kwargs: {kwargs}")
+            if args_formatter is not None:
+                formatted_args, formatted_kwargs = copy.copy(args), copy.copy(kwargs)
+                args_formatter(formatted_args, formatted_kwargs)
+            else:
+                formatted_args, formatted_kwargs = args, kwargs
+            logger.debug(
+                f"Enter {func.__name__}, args: {formatted_args}, kwargs: {formatted_kwargs}"
+            )
             start = time.time()
             ret = await func(*args, **kwargs)
             logger.debug(
@@ -125,3 +134,59 @@ def purge_dir(d):
                 os.rmdir(subdir)
         except Exception:
             pass
+def parse_model_version(model_version: str, model_type: str) -> Tuple:
+    results: List[str] = model_version.split("--")
+    if model_type == "LLM":
+        if len(results) != 4:
+            raise ValueError(
+                f"LLM model_version parses failed! model_version: {model_version}"
+            )
+        model_name = results[0]
+        size = results[1]
+        if not size.endswith("B"):
+            raise ValueError(f"Cannot parse model_size_in_billions: {size}")
+        size = size.rstrip("B")
+        size_in_billions: Union[int, str] = size if "_" in size else int(size)
+        model_format = results[2]
+        quantization = results[3]
+        return model_name, size_in_billions, model_format, quantization
+    elif model_type == "embedding":
+        assert len(results) > 0, "Embedding model_version parses failed!"
+        return (results[0],)
+    elif model_type == "rerank":
+        assert len(results) > 0, "Rerank model_version parses failed!"
+        return (results[0],)
+    elif model_type == "image":
+        assert 2 >= len(results) >= 1, "Image model_version parses failed!"
+        return tuple(results)
+    else:
+        raise ValueError(f"Not supported model_type: {model_type}")
+def _get_nvidia_gpu_mem_info(gpu_id: int) -> Dict[str, float]:
+    from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
+    handler = nvmlDeviceGetHandleByIndex(gpu_id)
+    mem_info = nvmlDeviceGetMemoryInfo(handler)
+    return {"total": mem_info.total, "used": mem_info.used, "free": mem_info.free}
+def get_nvidia_gpu_info() -> Dict:
+    try:
+        nvmlInit()
+        device_count = nvmlDeviceGetCount()
+        res = {}
+        for i in range(device_count):
+            res[f"gpu-{i}"] = _get_nvidia_gpu_mem_info(i)
+        return res
+    except:
+        # TODO: add log here
+        # logger.debug(f"Cannot init nvml. Maybe due to lack of NVIDIA GPUs or incorrect installation of CUDA.")
+        return {}
+    finally:
+        try:
+            nvmlShutdown()
+        except:
+            pass

xinference/core/worker.py CHANGED Viewed

@@ -18,11 +18,13 @@ import platform
 import queue
 import signal
 import threading
+import time
 from collections import defaultdict
 from logging import getLogger
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 import xoscar as xo
+from async_timeout import timeout
 from xoscar import MainActorPoolType
 from ..constants import XINFERENCE_CACHE_DIR
@@ -30,6 +32,7 @@ from ..core import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..model.core import ModelDescription, create_model_instance
 from ..utils import cuda_count
+from .event import Event, EventCollectorActor, EventType
 from .metrics import launch_metrics_export_server, record_metrics
 from .resource import gather_node_info
 from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
@@ -125,6 +128,15 @@ class WorkerActor(xo.StatelessActor):
                                 model_uid,
                                 recover_count - 1,
                             )
+                            event_model_uid, _, __ = parse_replica_model_uid(model_uid)
+                            await self._event_collector_ref.report_event(
+                                event_model_uid,
+                                Event(
+                                    event_type=EventType.WARNING,
+                                    event_ts=int(time.time()),
+                                    event_content="Recreate model",
+                                ),
+                            )
                             self._model_uid_to_recover_count[model_uid] = (
                                 recover_count - 1
                             )
@@ -141,6 +153,8 @@ class WorkerActor(xo.StatelessActor):
         return "worker"
     async def __post_create__(self):
+        from ..isolation import Isolation
+        from .cache_tracker import CacheTrackerActor
         from .status_guard import StatusGuardActor
         from .supervisor import SupervisorActor
@@ -149,24 +163,46 @@ class WorkerActor(xo.StatelessActor):
         ] = await xo.actor_ref(
             address=self._supervisor_address, uid=StatusGuardActor.uid()
         )
+        self._event_collector_ref: xo.ActorRefType[
+            EventCollectorActor
+        ] = await xo.actor_ref(
+            address=self._supervisor_address, uid=EventCollectorActor.uid()
+        )
+        self._cache_tracker_ref: xo.ActorRefType[
+            "CacheTrackerActor"
+        ] = await xo.actor_ref(
+            address=self._supervisor_address, uid=CacheTrackerActor.uid()
+        )
         self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
             address=self._supervisor_address, uid=SupervisorActor.uid()
         )
         await self._supervisor_ref.add_worker(self.address)
-        self._upload_task = asyncio.create_task(self._periodical_report_status())
+        # Run _periodical_report_status() in a dedicated thread.
+        self._isolation = Isolation(asyncio.new_event_loop(), threaded=True)
+        self._isolation.start()
+        asyncio.run_coroutine_threadsafe(
+            self._periodical_report_status(), loop=self._isolation.loop
+        )
         logger.info(f"Xinference worker {self.address} started")
         logger.info("Purge cache directory: %s", XINFERENCE_CACHE_DIR)
         purge_dir(XINFERENCE_CACHE_DIR)
         from ..model.embedding import (
             CustomEmbeddingModelSpec,
+            get_embedding_model_descriptions,
             register_embedding,
             unregister_embedding,
         )
-        from ..model.llm import register_llm, unregister_llm
-        from ..model.llm.llm_family import CustomLLMFamilyV1
-        from ..model.rerank.custom import (
+        from ..model.image import get_image_model_descriptions
+        from ..model.llm import (
+            CustomLLMFamilyV1,
+            get_llm_model_descriptions,
+            register_llm,
+            unregister_llm,
+        )
+        from ..model.rerank import (
             CustomRerankModelSpec,
+            get_rerank_model_descriptions,
             register_rerank,
             unregister_rerank,
         )
@@ -181,6 +217,16 @@ class WorkerActor(xo.StatelessActor):
             "rerank": (CustomRerankModelSpec, register_rerank, unregister_rerank),
         }
+        # record model version
+        model_version_infos: Dict[str, List[Dict]] = {}
+        model_version_infos.update(get_llm_model_descriptions())
+        model_version_infos.update(get_embedding_model_descriptions())
+        model_version_infos.update(get_rerank_model_descriptions())
+        model_version_infos.update(get_image_model_descriptions())
+        await self._cache_tracker_ref.record_model_version(
+            model_version_infos, self.address
+        )
         # Windows does not have signal handler
         if os.name != "nt":
@@ -194,7 +240,7 @@ class WorkerActor(xo.StatelessActor):
             )
     async def __pre_destroy__(self):
-        self._upload_task.cancel()
+        self._isolation.stop()
     @staticmethod
     def get_devices_count():
@@ -407,13 +453,30 @@ class WorkerActor(xo.StatelessActor):
             return ["rerank"]
         elif model_type == "image":
             return ["text_to_image"]
-        elif model_type == "multimodal":
-            return ["multimodal"]
+        elif model_type == "audio":
+            return ["audio_to_text"]
         else:
             assert model_type == "LLM"
             assert isinstance(model, LLM)
             return model.model_family.model_ability  # type: ignore
+    async def update_cache_status(
+        self, model_name: str, model_description: ModelDescription
+    ):
+        version_info = model_description.to_version_info()
+        if isinstance(version_info, list):  # image model
+            model_path = version_info[0]["model_file_location"]
+            await self._cache_tracker_ref.update_cache_status(
+                self.address, model_name, None, model_path
+            )
+        else:
+            await self._cache_tracker_ref.update_cache_status(
+                self.address,
+                model_name,
+                version_info["model_version"],
+                version_info["model_file_location"],
+            )
     @log_async(logger=logger)
     async def launch_builtin_model(
         self,
@@ -427,6 +490,15 @@ class WorkerActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
         **kwargs,
     ):
+        event_model_uid, _, __ = parse_replica_model_uid(model_uid)
+        await self._event_collector_ref.report_event(
+            event_model_uid,
+            Event(
+                event_type=EventType.INFO,
+                event_ts=int(time.time()),
+                event_content="Launch model",
+            ),
+        )
         launch_args = locals()
         launch_args.pop("self")
         launch_args.pop("kwargs")
@@ -464,6 +536,7 @@ class WorkerActor(xo.StatelessActor):
                 is_local_deployment,
                 **kwargs,
             )
+            await self.update_cache_status(model_name, model_description)
             model_ref = await xo.create_actor(
                 ModelActor,
                 address=subpool_address,
@@ -497,6 +570,15 @@ class WorkerActor(xo.StatelessActor):
     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
+        event_model_uid, _, __ = parse_replica_model_uid(model_uid)
+        await self._event_collector_ref.report_event(
+            event_model_uid,
+            Event(
+                event_type=EventType.INFO,
+                event_ts=int(time.time()),
+                event_content="Terminate model",
+            ),
+        )
         origin_uid, _, _ = parse_replica_model_uid(model_uid)
         await self._status_guard_ref.update_instance_info(
             origin_uid, {"status": LaunchStatus.TERMINATING.name}
@@ -553,7 +635,15 @@ class WorkerActor(xo.StatelessActor):
         return model_desc.to_dict()
     async def report_status(self):
-        status = await asyncio.to_thread(gather_node_info)
+        status = dict()
+        try:
+            # asyncio.timeout is only available in Python >= 3.11
+            async with timeout(2):
+                status = await asyncio.to_thread(gather_node_info)
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            logger.exception("Report status got error.")
         await self._supervisor_ref.report_worker_status(self.address, status)
     async def _periodical_report_status(self):

xinference/deploy/cmdline.py CHANGED Viewed

@@ -499,7 +499,7 @@ def list_model_registrations(
             tabulate(table, headers=["Type", "Name", "Family", "Is-built-in"]),
             file=sys.stderr,
         )
-    elif model_type == "multimodal":
+    elif model_type == "audio":
         for registration in registrations:
             model_name = registration["model_name"]
             model_family = client.get_model_registration(model_type, model_name)
@@ -507,12 +507,15 @@ def list_model_registrations(
                 [
                     model_type,
                     model_family["model_name"],
-                    model_family["model_lang"],
+                    model_family["model_family"],
+                    model_family["multilingual"],
                     registration["is_builtin"],
                 ]
             )
         print(
-            tabulate(table, headers=["Type", "Name", "Language", "Is-built-in"]),
+            tabulate(
+                table, headers=["Type", "Name", "Family", "Multilingual", "Is-built-in"]
+            ),
             file=sys.stderr,
         )
     else:

xinference/deploy/local.py CHANGED Viewed

@@ -23,7 +23,7 @@ import xoscar as xo
 from xoscar.utils import get_next_port
 from ..constants import (
-    XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+    XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
     XINFERENCE_HEALTH_CHECK_INTERVAL,
 )
 from ..core.supervisor import SupervisorActor
@@ -116,7 +116,7 @@ def main(
     if not health_check(
         address=supervisor_address,
-        max_attempts=XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+        max_attempts=XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
         sleep_interval=XINFERENCE_HEALTH_CHECK_INTERVAL,
     ):
         raise RuntimeError("Cluster is not available after multiple attempts")

xinference/deploy/supervisor.py CHANGED Viewed

@@ -23,7 +23,7 @@ import xoscar as xo
 from xoscar.utils import get_next_port
 from ..constants import (
-    XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+    XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
     XINFERENCE_HEALTH_CHECK_INTERVAL,
 )
 from ..core.supervisor import SupervisorActor
@@ -82,7 +82,7 @@ def main(
     if not health_check(
         address=supervisor_address,
-        max_attempts=XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+        max_attempts=XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
         sleep_interval=XINFERENCE_HEALTH_CHECK_INTERVAL,
     ):
         raise RuntimeError("Supervisor is not available after multiple attempts")

xinference/model/audio/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import codecs
+import json
+import os
+from .core import AudioModelFamilyV1, generate_audio_description, get_cache_status
+_model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
+BUILTIN_AUDIO_MODELS = dict(
+    (spec["model_name"], AudioModelFamilyV1(**spec))
+    for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+)
+del _model_spec_json

xinference/model/audio/core.py ADDED Viewed

@@ -0,0 +1,161 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+from pydantic import BaseModel
+from ...constants import XINFERENCE_CACHE_DIR
+from ..core import ModelDescription
+from ..utils import valid_model_revision
+from .whisper import WhisperModel
+MAX_ATTEMPTS = 3
+logger = logging.getLogger(__name__)
+class AudioModelFamilyV1(BaseModel):
+    model_family: str
+    model_name: str
+    model_id: str
+    model_revision: str
+    multilingual: bool
+class AudioModelDescription(ModelDescription):
+    def __init__(
+        self,
+        address: Optional[str],
+        devices: Optional[List[str]],
+        model_spec: AudioModelFamilyV1,
+        model_path: Optional[str] = None,
+    ):
+        super().__init__(address, devices, model_path=model_path)
+        self._model_spec = model_spec
+    def to_dict(self):
+        return {
+            "model_type": "audio",
+            "address": self.address,
+            "accelerators": self.devices,
+            "model_name": self._model_spec.model_name,
+            "model_family": self._model_spec.model_family,
+            "model_revision": self._model_spec.model_revision,
+        }
+    def to_version_info(self):
+        from .utils import get_model_version
+        if self._model_path is None:
+            is_cached = get_cache_status(self._model_spec)
+            file_location = get_cache_dir(self._model_spec)
+        else:
+            is_cached = True
+            file_location = self._model_path
+        return {
+            "model_version": get_model_version(self._model_spec),
+            "model_file_location": file_location,
+            "cache_status": is_cached,
+        }
+def generate_audio_description(
+    image_model: AudioModelFamilyV1,
+) -> Dict[str, List[Dict]]:
+    res = defaultdict(list)
+    res[image_model.model_name].extend(
+        AudioModelDescription(None, None, image_model).to_dict()
+    )
+    return res
+def match_model(model_name: str) -> AudioModelFamilyV1:
+    from . import BUILTIN_AUDIO_MODELS
+    if model_name in BUILTIN_AUDIO_MODELS:
+        return BUILTIN_AUDIO_MODELS[model_name]
+    else:
+        raise ValueError(
+            f"Image model {model_name} not found, available"
+            f"model list: {BUILTIN_AUDIO_MODELS.keys()}"
+        )
+def cache(model_spec: AudioModelFamilyV1):
+    # TODO: cache from uri
+    import huggingface_hub
+    cache_dir = get_cache_dir(model_spec)
+    if not os.path.exists(cache_dir):
+        os.makedirs(cache_dir, exist_ok=True)
+    meta_path = os.path.join(cache_dir, "__valid_download")
+    if valid_model_revision(meta_path, model_spec.model_revision):
+        return cache_dir
+    for current_attempt in range(1, MAX_ATTEMPTS + 1):
+        try:
+            huggingface_hub.snapshot_download(
+                model_spec.model_id,
+                revision=model_spec.model_revision,
+                local_dir=cache_dir,
+                local_dir_use_symlinks=True,
+                resume_download=True,
+            )
+            break
+        except huggingface_hub.utils.LocalEntryNotFoundError:
+            remaining_attempts = MAX_ATTEMPTS - current_attempt
+            logger.warning(
+                f"Attempt {current_attempt} failed. Remaining attempts: {remaining_attempts}"
+            )
+    else:
+        raise RuntimeError(
+            f"Failed to download model '{model_spec.model_name}' after {MAX_ATTEMPTS} attempts"
+        )
+    with open(meta_path, "w") as f:
+        import json
+        desc = AudioModelDescription(None, None, model_spec)
+        json.dump(desc.to_dict(), f)
+    return cache_dir
+def get_cache_dir(model_spec: AudioModelFamilyV1):
+    return os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name))
+def get_cache_status(
+    model_spec: AudioModelFamilyV1,
+) -> bool:
+    cache_dir = get_cache_dir(model_spec)
+    meta_path = os.path.join(cache_dir, "__valid_download")
+    return valid_model_revision(meta_path, model_spec.model_revision)
+def create_audio_model_instance(
+    subpool_addr: str, devices: List[str], model_uid: str, model_name: str, **kwargs
+) -> Tuple[WhisperModel, AudioModelDescription]:
+    model_spec = match_model(model_name)
+    model_path = cache(model_spec)
+    model = WhisperModel(model_uid, model_path, model_spec, **kwargs)
+    model_description = AudioModelDescription(
+        subpool_addr, devices, model_spec, model_path=model_path
+    )
+    return model, model_description

xinference/model/audio/model_spec.json ADDED Viewed

@@ -0,0 +1,79 @@
+[
+  {
+    "model_name": "whisper-tiny",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-tiny",
+    "model_revision": "167c219b21f11ef214220b8fdb7536b8a88c2475",
+    "multilingual": true
+  },
+  {
+    "model_name": "whisper-tiny.en",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-tiny.en",
+    "model_revision": "87c7102498dcde7456f24cfd30239ca606ed9063",
+    "multilingual": false
+  },
+  {
+    "model_name": "whisper-base",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-base",
+    "model_revision": "8c1db9b51951100007a96a525d83a8ec81b3c237",
+    "multilingual": true
+  },
+  {
+    "model_name": "whisper-base.en",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-base.en",
+    "model_revision": "911407f4214e0e1d82085af863093ec0b66f9cd6",
+    "multilingual": false
+  },
+  {
+    "model_name": "whisper-small",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-small",
+    "model_revision": "998cb1a777c20db53d6033a61b977ed4c3792cac",
+    "multilingual": true
+  },
+  {
+    "model_name": "whisper-small.en",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-small.en",
+    "model_revision": "e8727524f962ee844a7319d92be39ac1bd25655a",
+    "multilingual": false
+  },
+  {
+    "model_name": "whisper-medium",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-medium",
+    "model_revision": "16688beb1294bedd0a6f5cd86fe7eec57bce41ed",
+    "multilingual": true
+  },
+  {
+    "model_name": "whisper-medium.en",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-medium.en",
+    "model_revision": "2e98eb6279edf5095af0c8dedb36bdec0acd172b",
+    "multilingual": false
+  },
+  {
+    "model_name": "whisper-large-v3",
+    "model_family": "whisper",
+    "model_id": "openai/whisper-large-v3",
+    "model_revision": "6cdf07a7e3ec3806e5d55f787915b85d4cd020b1",
+    "multilingual": true
+  },
+  {
+    "model_name": "Belle-distilwhisper-large-v2-zh",
+    "model_family": "whisper",
+    "model_id": "BELLE-2/Belle-distilwhisper-large-v2-zh",
+    "model_revision": "ed25d13498fa5bac758b2fc479435b698532dfe8",
+    "multilingual": false
+  },
+  {
+    "model_name": "Belle-whisper-large-v2-zh",
+    "model_family": "whisper",
+    "model_id": "BELLE-2/Belle-whisper-large-v2-zh",
+    "model_revision": "ec5bd5d78598545b7585814edde86dac2002b5b9",
+    "multilingual": false
+  }
+]

xinference/model/audio/utils.py ADDED Viewed

@@ -0,0 +1,18 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .core import AudioModelFamilyV1
+def get_model_version(audio_model: AudioModelFamilyV1) -> str:
+    return audio_model.model_name

xinference 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl

Potentially problematic release.

xinference 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl