xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff shows the content of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/core/utils.py CHANGED
@@ -11,11 +11,13 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import asyncio
  import logging
  import os
  import random
  import string
  import uuid
+ import weakref
  from enum import Enum
  from typing import Dict, Generator, List, Optional, Tuple, Union

@@ -23,7 +25,10 @@ import orjson
  from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown

  from .._compat import BaseModel
- from ..constants import XINFERENCE_LOG_ARG_MAX_LENGTH
+ from ..constants import (
+     XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION,
+     XINFERENCE_LOG_ARG_MAX_LENGTH,
+ )

  logger = logging.getLogger(__name__)

@@ -49,13 +54,24 @@ def log_async(
  ):
      import time
      from functools import wraps
+     from inspect import signature

      def decorator(func):
          func_name = func.__name__
+         sig = signature(func)

          @wraps(func)
          async def wrapped(*args, **kwargs):
-             request_id_str = kwargs.get("request_id", "")
+             request_id_str = kwargs.get("request_id")
+             if not request_id_str:
+                 # sometimes `request_id` not in kwargs
+                 # we try to bind the arguments
+                 try:
+                     bound_args = sig.bind_partial(*args, **kwargs)
+                     arguments = bound_args.arguments
+                 except TypeError:
+                     arguments = {}
+                 request_id_str = arguments.get("request_id", "")
              if not request_id_str:
                  request_id_str = uuid.uuid1()
              if func_name == "text_to_image":
@@ -260,8 +276,8 @@ def get_nvidia_gpu_info() -> Dict:


  def assign_replica_gpu(
-     _replica_model_uid: str, replica: int, gpu_idx: Union[int, List[int]]
- ) -> List[int]:
+     _replica_model_uid: str, replica: int, gpu_idx: Optional[Union[int, List[int]]]
+ ) -> Optional[List[int]]:
      model_uid, rep_id = parse_replica_model_uid(_replica_model_uid)
      rep_id, replica = int(rep_id), int(replica)
      if isinstance(gpu_idx, int):
@@ -269,3 +285,56 @@
      if isinstance(gpu_idx, list) and gpu_idx:
          return gpu_idx[rep_id::replica]
      return gpu_idx
+
+
+ class CancelMixin:
+     _CANCEL_TASK_NAME = "abort_block"
+
+     def __init__(self):
+         self._running_tasks: weakref.WeakValueDictionary[
+             str, asyncio.Task
+         ] = weakref.WeakValueDictionary()
+
+     def _add_running_task(self, request_id: Optional[str]):
+         """Add current asyncio task to the running task.
+         :param request_id: The corresponding request id.
+         """
+         if request_id is None:
+             return
+         running_task = self._running_tasks.get(request_id)
+         if running_task is not None:
+             if running_task.get_name() == self._CANCEL_TASK_NAME:
+                 raise Exception(f"The request has been aborted: {request_id}")
+             raise Exception(f"Duplicate request id: {request_id}")
+         current_task = asyncio.current_task()
+         assert current_task is not None
+         self._running_tasks[request_id] = current_task
+
+     def _cancel_running_task(
+         self,
+         request_id: Optional[str],
+         block_duration: int = XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION,
+     ):
+         """Cancel the running asyncio task.
+         :param request_id: The request id to cancel.
+         :param block_duration: The duration seconds to ensure the request can't be executed.
+         """
+         if request_id is None:
+             return
+         running_task = self._running_tasks.pop(request_id, None)
+         if running_task is not None:
+             running_task.cancel()
+
+         async def block_task():
+             """This task is for blocking the request for a duration."""
+             try:
+                 await asyncio.sleep(block_duration)
+                 logger.info("Abort block end for request: %s", request_id)
+             except asyncio.CancelledError:
+                 logger.info("Abort block is cancelled for request: %s", request_id)
+
+         if block_duration > 0:
+             logger.info("Abort block start for request: %s", request_id)
+             self._running_tasks[request_id] = asyncio.create_task(
+                 block_task(), name=self._CANCEL_TASK_NAME
+             )
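The CancelMixin above lets a model actor abort an in-flight request by its request id and then keep that id blocked for a while. The snippet below is a minimal standalone sketch of the same register-then-cancel pattern (plain asyncio, no xinference imports; all names are illustrative), not the xinference API itself:

    import asyncio

    running = {}  # request_id -> asyncio.Task, analogous to CancelMixin._running_tasks

    async def handle(request_id: str):
        running[request_id] = asyncio.current_task()  # what _add_running_task does
        try:
            await asyncio.sleep(10)  # stand-in for a long-running inference call
        except asyncio.CancelledError:
            print(f"request {request_id} aborted")
            raise
        finally:
            running.pop(request_id, None)

    async def abort(request_id: str):
        task = running.get(request_id)  # what _cancel_running_task does
        if task is not None:
            task.cancel()

    async def main():
        t = asyncio.create_task(handle("req-1"))
        await asyncio.sleep(0.1)  # let the handler start and register itself
        await abort("req-1")
        await asyncio.gather(t, return_exceptions=True)

    asyncio.run(main())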
xinference/core/worker.py CHANGED
@@ -22,8 +22,9 @@ import signal
  import threading
  import time
  from collections import defaultdict
+ from dataclasses import dataclass
  from logging import getLogger
- from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
+ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union, no_type_check

  import xoscar as xo
  from async_timeout import timeout
@@ -58,6 +59,11 @@ else:
      MODEL_ACTOR_AUTO_RECOVER_LIMIT = None


+ @dataclass
+ class ModelStatus:
+     last_error: str = ""
+
+
  class WorkerActor(xo.StatelessActor):
      def __init__(
          self,
@@ -90,6 +96,7 @@ class WorkerActor(xo.StatelessActor):
          # attributes maintained after model launched:
          self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
          self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
+         self._model_uid_to_model_status: Dict[str, ModelStatus] = {}
          self._gpu_to_model_uid: Dict[int, str] = {}
          self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
          # Dict structure: gpu_index: {(replica_model_uid, model_type)}
@@ -177,12 +184,12 @@ class WorkerActor(xo.StatelessActor):
                              self._model_uid_to_recover_count[model_uid] = (
                                  recover_count - 1
                              )
-                             await self.launch_builtin_model(**launch_args)
+                             await self.recover_model(launch_args)
                          else:
                              logger.warning("Stop recreating model actor.")
                      else:
                          logger.warning("Recreating model actor %s ...", model_uid)
-                         await self.launch_builtin_model(**launch_args)
+                         await self.recover_model(launch_args)
                  break

      @classmethod
@@ -866,6 +873,9 @@ class WorkerActor(xo.StatelessActor):
          )

          try:
+             xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
+             if xavier_config is not None:
+                 xavier_config["rank_address"] = subpool_address
              model, model_description = await asyncio.to_thread(
                  create_model_instance,
                  subpool_address,
@@ -893,6 +903,7 @@
                  model=model,
                  model_description=model_description,
                  request_limits=request_limits,
+                 xavier_config=xavier_config,
              )
              await model_ref.load()
          except:
@@ -902,6 +913,7 @@
              raise
          self._model_uid_to_model[model_uid] = model_ref
          self._model_uid_to_model_spec[model_uid] = model_description
+         self._model_uid_to_model_status[model_uid] = ModelStatus()
          self._model_uid_to_addr[model_uid] = subpool_address
          self._model_uid_to_recover_count.setdefault(
              model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
@@ -921,13 +933,18 @@
              origin_uid,
              {"model_ability": abilities, "status": LaunchStatus.READY.name},
          )
+         return subpool_address

      @log_async(logger=logger, level=logging.INFO)
      async def terminate_model(self, model_uid: str, is_model_die=False):
          # Terminate model while its launching is not allow
          if model_uid in self._model_uid_launching_guard:
              raise ValueError(f"{model_uid} is launching")
-         origin_uid, _ = parse_replica_model_uid(model_uid)
+         # In special cases, if the suffix is `-rank0`, this is the Xavier's rank 0 model actor.
+         if model_uid.endswith("-rank0"):
+             origin_uid = model_uid.removesuffix("-rank0")
+         else:
+             origin_uid, _ = parse_replica_model_uid(model_uid)
          try:
              _ = await self.get_supervisor_ref()
              if self._event_collector_ref is not None:
@@ -976,6 +993,7 @@
                  status = LaunchStatus.ERROR.name
              else:
                  status = LaunchStatus.TERMINATED.name
+             self._model_uid_to_model_status.pop(model_uid, None)

              if self._status_guard_ref is None:
                  _ = await self.get_supervisor_ref()
@@ -1010,6 +1028,9 @@

      @log_sync(logger=logger)
      def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
+         model_status = self._model_uid_to_model_status.get(model_uid)
+         if model_status and model_status.last_error:
+             raise Exception(model_status.last_error)
          model_ref = self._model_uid_to_model.get(model_uid, None)
          if model_ref is None:
              raise ValueError(f"Model not found, uid: {model_uid}")
@@ -1138,6 +1159,83 @@
          }
          return ret

+     def update_model_status(self, model_uid: str, **kwargs):
+         model_status = self._model_uid_to_model_status.get(model_uid)
+         if model_status is not None:
+             for k, v in kwargs.items():
+                 setattr(model_status, k, v)
+
+     def get_model_status(self, model_uid: str):
+         return self._model_uid_to_model_status.get(model_uid)
+
      @staticmethod
      def record_metrics(name, op, kwargs):
          record_metrics(name, op, kwargs)
+
+     async def start_transfer_for_vllm(
+         self, rep_model_uid: str, rank_addresses: List[str]
+     ):
+         model_ref = self._model_uid_to_model[rep_model_uid]
+         await model_ref.start_transfer_for_vllm(rank_addresses)
+
+     @log_async(logger=logger, level=logging.INFO)
+     async def launch_rank0_model(
+         self, rep_model_uid: str, xavier_config: Dict[str, Any]
+     ) -> Tuple[str, int]:
+         from ..model.llm.vllm.xavier.collective_manager import Rank0ModelActor
+
+         if os.name != "nt" and platform.system() != "Darwin":
+             # Linux
+             start_method = "forkserver"
+         else:
+             # Windows and macOS
+             start_method = "spawn"
+         subpool_address = await self._main_pool.append_sub_pool(
+             start_method=start_method
+         )
+
+         store_address = subpool_address.split(":")[0]
+         # Note that `store_port` needs to be generated on the worker,
+         # as the TCP store is on rank 0, not on the supervisor.
+         store_port = xo.utils.get_next_port()
+         self._model_uid_launching_guard[rep_model_uid] = True
+         try:
+             try:
+                 xavier_config["rank_address"] = subpool_address
+                 xavier_config["store_address"] = store_address
+                 xavier_config["store_port"] = store_port
+                 model_ref = await xo.create_actor(
+                     Rank0ModelActor,
+                     address=subpool_address,
+                     uid=rep_model_uid,
+                     xavier_config=xavier_config,
+                 )
+             except:
+                 await self._main_pool.remove_sub_pool(subpool_address)
+                 raise
+             self._model_uid_to_model[rep_model_uid] = model_ref
+             self._model_uid_to_addr[rep_model_uid] = subpool_address
+         finally:
+             del self._model_uid_launching_guard[rep_model_uid]
+         return subpool_address, store_port
+
+     @no_type_check
+     async def recover_model(self, launch_args: Dict[str, Any]):
+         rep_model_uid = launch_args.get("model_uid")
+         origin_uid, _ = parse_replica_model_uid(rep_model_uid)
+         xavier_config: Optional[Dict[str, Any]] = launch_args.get("xavier_config", None)
+         is_xavier: bool = xavier_config is not None
+         supervisor_ref = await self.get_supervisor_ref(add_worker=False)
+         if is_xavier:
+             rank = xavier_config.get("rank")
+             await supervisor_ref.call_collective_manager(
+                 origin_uid, "unregister_rank", rank
+             )
+         subpool_address = await self.launch_builtin_model(**launch_args)
+         if is_xavier:
+             model_ref = self._model_uid_to_model[rep_model_uid]
+             await model_ref.start_transfer_for_vllm([])
+             rank = xavier_config.get("rank")
+             await supervisor_ref.call_collective_manager(
+                 origin_uid, "register_rank", rank, subpool_address, update=True
+             )
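The worker changes above record a per-model ModelStatus and surface its last_error from get_model instead of handing back a broken actor reference. A stripped-down, synchronous sketch of that bookkeeping (class and variable names here are shortened stand-ins, not the actor implementation):

    from dataclasses import dataclass
    from typing import Dict

    @dataclass
    class ModelStatus:
        last_error: str = ""

    class Worker:
        def __init__(self):
            self._models: Dict[str, object] = {}
            self._status: Dict[str, ModelStatus] = {}

        def launch(self, uid: str, model: object):
            self._models[uid] = model
            self._status[uid] = ModelStatus()

        def update_model_status(self, uid: str, **kwargs):
            status = self._status.get(uid)
            if status is not None:
                for k, v in kwargs.items():
                    setattr(status, k, v)

        def get_model(self, uid: str):
            status = self._status.get(uid)
            if status and status.last_error:
                # a model that died keeps reporting why, instead of a dangling ref
                raise Exception(status.last_error)
            return self._models[uid]

    w = Worker()
    w.launch("demo-model-0", object())
    w.update_model_status("demo-model-0", last_error="CUDA out of memory")
    try:
        w.get_model("demo-model-0")
    except Exception as e:
        print(e)  # -> CUDA out of memory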
xinference/deploy/cmdline.py CHANGED
@@ -846,7 +846,9 @@ def model_launch(
      kwargs = {}
      for i in range(0, len(ctx.args), 2):
          if not ctx.args[i].startswith("--"):
-             raise ValueError("You must specify extra kwargs with `--` prefix.")
+             raise ValueError(
+                 f"You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is {ctx.args[i]}."
+             )
          kwargs[ctx.args[i][2:]] = handle_click_args_type(ctx.args[i + 1])
      print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)

xinference/deploy/test/test_cmdline.py CHANGED
@@ -23,6 +23,7 @@ from ..cmdline import (
      list_model_registrations,
      model_chat,
      model_generate,
+     model_launch,
      model_list,
      model_terminate,
      register_model,
@@ -311,3 +312,58 @@ def test_remove_cache(setup):

      assert result.exit_code == 0
      assert "Cache directory qwen1.5-chat has been deleted."
+
+
+ def test_launch_error_in_passing_parameters():
+     runner = CliRunner()
+
+     # Known parameter but not provided with value.
+     result = runner.invoke(
+         model_launch,
+         [
+             "--model-engine",
+             "transformers",
+             "--model-name",
+             "qwen2.5-instruct",
+             "--model-uid",
+             "-s",
+             "0.5",
+             "-f",
+             "gptq",
+             "-q",
+             "INT4",
+             "111",
+             "-l",
+         ],
+     )
+     assert result.exit_code == 1
+     assert (
+         "You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is 0.5."
+         in str(result)
+     )
+
+     # Unknown parameter
+     result = runner.invoke(
+         model_launch,
+         [
+             "--model-engine",
+             "transformers",
+             "--model-name",
+             "qwen2.5-instruct",
+             "--model-uid",
+             "123",
+             "-s",
+             "0.5",
+             "-f",
+             "gptq",
+             "-q",
+             "INT4",
+             "-l",
+             "111",
+         ],
+     )
+     assert result.exit_code == 1
+     assert (
+         "You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is -l."
+         in str(result)
+     )
xinference/isolation.py CHANGED
@@ -37,6 +37,30 @@ class Isolation:
          asyncio.set_event_loop(self._loop)
          self._stopped = asyncio.Event()
          self._loop.run_until_complete(self._stopped.wait())
+         self._cancel_all_tasks(self._loop)
+
+     @staticmethod
+     def _cancel_all_tasks(loop):
+         to_cancel = asyncio.all_tasks(loop)
+         if not to_cancel:
+             return
+
+         for task in to_cancel:
+             task.cancel()
+
+         loop.run_until_complete(asyncio.gather(*to_cancel, return_exceptions=True))
+
+         for task in to_cancel:
+             if task.cancelled():
+                 continue
+             if task.exception() is not None:
+                 loop.call_exception_handler(
+                     {
+                         "message": "unhandled exception during asyncio.run() shutdown",
+                         "exception": task.exception(),
+                         "task": task,
+                     }
+                 )

      def start(self):
          if self._threaded:
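The _cancel_all_tasks helper mirrors what asyncio.run() does at shutdown: cancel whatever is still pending, let the tasks unwind, and only then close the loop. A self-contained sketch of that shutdown sequence outside the Isolation class (illustrative names and timings, not xinference code):

    import asyncio

    async def worker():
        try:
            await asyncio.sleep(3600)  # a task that would otherwise outlive the loop
        except asyncio.CancelledError:
            print("worker cancelled")
            raise

    loop = asyncio.new_event_loop()
    loop.create_task(worker())
    loop.run_until_complete(asyncio.sleep(0.1))  # let the task start

    pending = asyncio.all_tasks(loop)
    for task in pending:
        task.cancel()
    # gather with return_exceptions=True so cancellations do not blow up shutdown
    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
    loop.close()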
xinference/model/audio/__init__.py CHANGED
@@ -15,6 +15,8 @@
  import codecs
  import json
  import os
+ import platform
+ import sys
  import warnings
  from typing import Any, Dict

@@ -55,6 +57,14 @@ def register_custom_model():
              warnings.warn(f"{user_defined_audio_dir}/{f} has error, {e}")


+ def _need_filter(spec: dict):
+     if (sys.platform != "darwin" or platform.processor() != "arm") and spec.get(
+         "engine", ""
+     ).upper() == "MLX":
+         return True
+     return False
+
+
  def _install():
      _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
      _model_spec_modelscope_json = os.path.join(
@@ -64,6 +74,7 @@
          dict(
              (spec["model_name"], AudioModelFamilyV1(**spec))
              for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+             if not _need_filter(spec)
          )
      )
      for model_name, model_spec in BUILTIN_AUDIO_MODELS.items():
@@ -75,6 +86,7 @@
              for spec in json.load(
                  codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
              )
+             if not _need_filter(spec)
          )
      )
      for model_name, model_spec in MODELSCOPE_AUDIO_MODELS.items():
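_need_filter keeps MLX-only audio specs out of the registries on machines that are not Apple-silicon Macs (sys.platform "darwin" plus an "arm" processor). A small illustration of the predicate with made-up spec dicts (the model names below are only examples):

    import platform
    import sys

    def _need_filter(spec: dict):
        if (sys.platform != "darwin" or platform.processor() != "arm") and spec.get(
            "engine", ""
        ).upper() == "MLX":
            return True
        return False

    specs = [
        {"model_name": "whisper-large-v3"},                        # no engine -> always kept
        {"model_name": "whisper-large-v3-mlx", "engine": "MLX"},   # kept only on Apple silicon
    ]
    kept = [s["model_name"] for s in specs if not _need_filter(s)]
    print(kept)  # on a Linux/x86 host: ['whisper-large-v3']; on an Apple-silicon Mac: both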
xinference/model/audio/core.py CHANGED
@@ -21,9 +21,13 @@ from ..core import CacheableModelSpec, ModelDescription
  from ..utils import valid_model_revision
  from .chattts import ChatTTSModel
  from .cosyvoice import CosyVoiceModel
+ from .f5tts import F5TTSModel
+ from .f5tts_mlx import F5TTSMLXModel
  from .fish_speech import FishSpeechModel
  from .funasr import FunASRModel
+ from .melotts import MeloTTSModel
  from .whisper import WhisperModel
+ from .whisper_mlx import WhisperMLXModel

  logger = logging.getLogger(__name__)

@@ -43,11 +47,13 @@ class AudioModelFamilyV1(CacheableModelSpec):
      model_family: str
      model_name: str
      model_id: str
-     model_revision: str
+     model_revision: Optional[str]
      multilingual: bool
+     language: Optional[str]
      model_ability: Optional[str]
      default_model_config: Optional[Dict[str, Any]]
      default_transcription_config: Optional[Dict[str, Any]]
+     engine: Optional[str]


  class AudioModelDescription(ModelDescription):
@@ -160,17 +166,38 @@ def create_audio_model_instance(
      model_path: Optional[str] = None,
      **kwargs,
  ) -> Tuple[
-     Union[WhisperModel, FunASRModel, ChatTTSModel, CosyVoiceModel, FishSpeechModel],
+     Union[
+         WhisperModel,
+         WhisperMLXModel,
+         FunASRModel,
+         ChatTTSModel,
+         CosyVoiceModel,
+         FishSpeechModel,
+         F5TTSModel,
+         F5TTSMLXModel,
+         MeloTTSModel,
+     ],
      AudioModelDescription,
  ]:
      model_spec = match_audio(model_name, download_hub)
      if model_path is None:
          model_path = cache(model_spec)
      model: Union[
-         WhisperModel, FunASRModel, ChatTTSModel, CosyVoiceModel, FishSpeechModel
+         WhisperModel,
+         WhisperMLXModel,
+         FunASRModel,
+         ChatTTSModel,
+         CosyVoiceModel,
+         FishSpeechModel,
+         F5TTSModel,
+         F5TTSMLXModel,
+         MeloTTSModel,
      ]
      if model_spec.model_family == "whisper":
-         model = WhisperModel(model_uid, model_path, model_spec, **kwargs)
+         if not model_spec.engine:
+             model = WhisperModel(model_uid, model_path, model_spec, **kwargs)
+         else:
+             model = WhisperMLXModel(model_uid, model_path, model_spec, **kwargs)
      elif model_spec.model_family == "funasr":
          model = FunASRModel(model_uid, model_path, model_spec, **kwargs)
      elif model_spec.model_family == "ChatTTS":
@@ -179,6 +206,12 @@
          model = CosyVoiceModel(model_uid, model_path, model_spec, **kwargs)
      elif model_spec.model_family == "FishAudio":
          model = FishSpeechModel(model_uid, model_path, model_spec, **kwargs)
+     elif model_spec.model_family == "F5-TTS":
+         model = F5TTSModel(model_uid, model_path, model_spec, **kwargs)
+     elif model_spec.model_family == "F5-TTS-MLX":
+         model = F5TTSMLXModel(model_uid, model_path, model_spec, **kwargs)
+     elif model_spec.model_family == "MeloTTS":
+         model = MeloTTSModel(model_uid, model_path, model_spec, **kwargs)
      else:
          raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
      model_description = AudioModelDescription(
xinference/model/audio/cosyvoice.py CHANGED
@@ -39,6 +39,7 @@ class CosyVoiceModel:
          self._device = device
          self._model = None
          self._kwargs = kwargs
+         self._is_cosyvoice2 = False

      @property
      def model_ability(self):
@@ -48,14 +49,32 @@
          import os
          import sys

+         import torch
+
          # The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
-         sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
+         thirdparty_dir = os.path.join(os.path.dirname(__file__), "../../thirdparty")
+         sys.path.insert(0, thirdparty_dir)
+
+         if "CosyVoice2" in self._model_spec.model_name:
+             from cosyvoice.cli.cosyvoice import CosyVoice2 as CosyVoice
+
+             self._is_cosyvoice2 = True
+         else:
+             from cosyvoice.cli.cosyvoice import CosyVoice

-         from cosyvoice.cli.cosyvoice import CosyVoice
+             self._is_cosyvoice2 = False

-         self._model = CosyVoice(
-             self._model_path, load_jit=self._kwargs.get("load_jit", False)
+         # Unify this configuration name as 'compile' to be compatible with the name 'load_jit'.
+         load_jit = self._kwargs.get("load_jit", False) or self._kwargs.get(
+             "compile", False
          )
+         logger.info("Loading CosyVoice model, compile=%s...", load_jit)
+         self._model = CosyVoice(self._model_path, load_jit=load_jit)
+         if self._is_cosyvoice2:
+             spk2info_file = os.path.join(thirdparty_dir, "cosyvoice/bin/spk2info.pt")
+             self._model.frontend.spk2info = torch.load(
+                 spk2info_file, map_location=self._device
+             )

      def _speech_handle(
          self,
@@ -78,6 +97,15 @@
                  output = self._model.inference_zero_shot(
                      input, prompt_text, prompt_speech_16k, stream=stream
                  )
+             elif instruct_text:
+                 assert self._is_cosyvoice2
+                 logger.info("CosyVoice inference_instruct")
+                 output = self._model.inference_instruct2(
+                     input,
+                     instruct_text=instruct_text,
+                     prompt_speech_16k=prompt_speech_16k,
+                     stream=stream,
+                 )
              else:
                  logger.info("CosyVoice inference_cross_lingual")
                  output = self._model.inference_cross_lingual(
@@ -87,6 +115,7 @@
              available_speakers = self._model.list_avaliable_spks()
              if not voice:
                  voice = available_speakers[0]
+                 logger.info("Auto select speaker: %s", voice)
              else:
                  assert (
                      voice in available_speakers
@@ -106,7 +135,9 @@
          def _generator_stream():
              with BytesIO() as out:
                  writer = torchaudio.io.StreamWriter(out, format=response_format)
-                 writer.add_audio_stream(sample_rate=22050, num_channels=1)
+                 writer.add_audio_stream(
+                     sample_rate=self._model.sample_rate, num_channels=1
+                 )
                  i = 0
                  last_pos = 0
                  with writer.open():
@@ -125,7 +156,7 @@
              chunks = [o["tts_speech"] for o in output]
              t = torch.cat(chunks, dim=1)
              with BytesIO() as out:
-                 torchaudio.save(out, t, 22050, format=response_format)
+                 torchaudio.save(out, t, self._model.sample_rate, format=response_format)
              return out.getvalue()

          return _generator_stream() if stream else _generator_block()
@@ -163,6 +194,8 @@
              assert (
                  prompt_text is None
              ), "CosyVoice Instruct model does not support prompt_text"
+         elif self._is_cosyvoice2:
+             pass
          else:
              # inference_zero_shot
              # inference_cross_lingual
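The streaming and blocking audio paths above now use the loaded model's own sample_rate instead of a hard-coded 22050 Hz, which matters when the loaded model (for example CosyVoice 2) does not generate 22.05 kHz audio. A standalone sketch of the same torchaudio streaming pattern, with the sample rate and tensor contents as placeholders rather than real model output:

    from io import BytesIO

    import torch
    import torchaudio

    sample_rate = 24000                  # placeholder; the real code reads model.sample_rate
    chunk = torch.zeros(1, sample_rate)  # one second of silence, shape (channels, frames)

    with BytesIO() as out:
        writer = torchaudio.io.StreamWriter(out, format="wav")
        writer.add_audio_stream(sample_rate=sample_rate, num_channels=1)
        with writer.open():
            writer.write_audio_chunk(0, chunk.T)  # StreamWriter expects (frames, channels)
        data = out.getvalue()

    print(len(data))  # encoded bytes, ready to be returned or streamed to the client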