xinference 0.9.4__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +47 -18
- xinference/api/oauth2/types.py +1 -0
- xinference/api/restful_api.py +34 -7
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +20 -4
- xinference/conftest.py +13 -2
- xinference/core/supervisor.py +48 -1
- xinference/core/worker.py +139 -20
- xinference/deploy/cmdline.py +119 -20
- xinference/model/embedding/core.py +1 -2
- xinference/model/llm/__init__.py +4 -6
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +877 -13
- xinference/model/llm/llm_family.py +15 -0
- xinference/model/llm/llm_family_modelscope.json +571 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +22 -26
- xinference/model/llm/pytorch/deepseek_vl.py +232 -0
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/omnilmm.py +153 -0
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +53 -5
- xinference/model/llm/vllm/core.py +54 -6
- xinference/model/rerank/core.py +3 -0
- xinference/thirdparty/deepseek_vl/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl/models/__init__.py +28 -0
- xinference/thirdparty/deepseek_vl/models/clip_encoder.py +242 -0
- xinference/thirdparty/deepseek_vl/models/image_processing_vlm.py +208 -0
- xinference/thirdparty/deepseek_vl/models/modeling_vlm.py +170 -0
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +390 -0
- xinference/thirdparty/deepseek_vl/models/projector.py +100 -0
- xinference/thirdparty/deepseek_vl/models/sam.py +593 -0
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +681 -0
- xinference/thirdparty/deepseek_vl/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl/utils/conversation.py +348 -0
- xinference/thirdparty/deepseek_vl/utils/io.py +78 -0
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +216 -0
- xinference/thirdparty/omnilmm/constants.py +4 -0
- xinference/thirdparty/omnilmm/conversation.py +332 -0
- xinference/thirdparty/omnilmm/model/__init__.py +1 -0
- xinference/thirdparty/omnilmm/model/omnilmm.py +594 -0
- xinference/thirdparty/omnilmm/model/resampler.py +166 -0
- xinference/thirdparty/omnilmm/model/utils.py +563 -0
- xinference/thirdparty/omnilmm/train/__init__.py +13 -0
- xinference/thirdparty/omnilmm/train/train_utils.py +150 -0
- xinference/thirdparty/omnilmm/utils.py +134 -0
- xinference/types.py +15 -19
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/METADATA +25 -12
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/RECORD +79 -58
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.66b1c4fb.js +0 -3
- xinference/web/ui/build/static/js/main.66b1c4fb.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2124cfe036b26befcbd386d1d17743b1a58d0b7a041a17bb67f9924400d63c3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.66b1c4fb.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/core/worker.py
CHANGED

@@ -74,6 +74,10 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
+        # Dict structure: gpu_index: {(replica_model_uid, model_type)}
+        self._user_specified_gpu_to_model_uids: Dict[
+            int, Set[Tuple[str, str]]
+        ] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
         self._model_uid_to_recover_count: Dict[str, int] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}

@@ -268,12 +272,27 @@
         """
         candidates = []
         for _dev in self._total_gpu_devices:
-            if
+            if (
+                _dev not in self._gpu_to_model_uid
+                and _dev not in self._user_specified_gpu_to_model_uids
+            ):  # no possible vllm model on it, add it to candidates
                 candidates.append(_dev)
-            else:
-
-
-
+            else:  # need to judge that whether to have vllm model on this device
+                has_vllm_model = False
+                if _dev in self._gpu_to_model_uid:
+                    existing_model_uid = self._gpu_to_model_uid[_dev]
+                    has_vllm_model = await self.is_model_vllm_backend(
+                        existing_model_uid
+                    )
+                if (
+                    not has_vllm_model
+                    and _dev in self._user_specified_gpu_to_model_uids
+                ):
+                    for rep_uid, _ in self._user_specified_gpu_to_model_uids[_dev]:
+                        has_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                        if has_vllm_model:
+                            break
+                if not has_vllm_model:
                     candidates.append(_dev)

         if len(candidates) == 0:

@@ -285,9 +304,13 @@
         device, min_cnt = -1, -1
         # Pick the device with the fewest existing models among all the candidate devices.
         for _dev in candidates:
-            existing_cnt =
+            existing_cnt = 0
+            if _dev in self._gpu_to_embedding_model_uids:
+                existing_cnt += len(self._gpu_to_embedding_model_uids[_dev])
             if _dev in self._gpu_to_model_uid:
                 existing_cnt += 1
+            if _dev in self._user_specified_gpu_to_model_uids:
+                existing_cnt += len(self._user_specified_gpu_to_model_uids[_dev])
             if min_cnt == -1 or existing_cnt < min_cnt:
                 device, min_cnt = _dev, existing_cnt

@@ -295,17 +318,82 @@
         return device

     def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]:
-
+        user_specified_allocated_devices: Set[int] = set()
+        for dev, model_infos in self._user_specified_gpu_to_model_uids.items():
+            allocated_non_embedding_rerank_models = False
+            for _, model_type in model_infos:
+                allocated_non_embedding_rerank_models = model_type not in [
+                    "embedding",
+                    "rerank",
+                ]
+                if allocated_non_embedding_rerank_models:
+                    break
+            if allocated_non_embedding_rerank_models:
+                user_specified_allocated_devices.add(dev)
+        allocated_devices = set(self._gpu_to_model_uid.keys()).union(
+            user_specified_allocated_devices
+        )
+        if n_gpu > len(self._total_gpu_devices) - len(allocated_devices):
             raise RuntimeError("No available slot found for the model")

         devices: List[int] = [
-            dev
+            dev
+            for dev in self._total_gpu_devices
+            if dev not in self._gpu_to_model_uid
+            and dev not in user_specified_allocated_devices
         ][:n_gpu]
         for dev in devices:
             self._gpu_to_model_uid[int(dev)] = model_uid

         return sorted(devices)

+    async def allocate_devices_with_gpu_idx(
+        self, model_uid: str, model_type: str, gpu_idx: List[int]
+    ) -> List[int]:
+        """
+        When user specifies the gpu_idx, allocate models on user-specified GPUs whenever possible
+        """
+        # must be subset of total devices visible to this worker
+        if not set(gpu_idx) <= set(self._total_gpu_devices):
+            raise ValueError(
+                f"Worker {self.address} cannot use the GPUs with these indexes: {gpu_idx}. "
+                f"Worker {self.address} can only see these GPUs: {self._total_gpu_devices}."
+            )
+        # currently just report a warning log when there are already models on these GPUs
+        for idx in gpu_idx:
+            existing_model_uids = []
+            if idx in self._gpu_to_model_uid:
+                rep_uid = self._gpu_to_model_uid[idx]
+                is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                if is_vllm_model:
+                    raise RuntimeError(
+                        f"GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                        f"therefore cannot allocate GPU memory for a new model."
+                    )
+                existing_model_uids.append(rep_uid)
+            if idx in self._gpu_to_embedding_model_uids:
+                existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
+            # If user has run the vLLM model on the GPU that was forced to be specified,
+            # it is not possible to force this GPU to be allocated again
+            if idx in self._user_specified_gpu_to_model_uids:
+                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
+                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                    if is_vllm_model:
+                        raise RuntimeError(
+                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                            f"therefore cannot allocate GPU memory for a new model."
+                        )
+
+            if existing_model_uids:
+                logger.warning(
+                    f"WARNING!!! GPU index {idx} has been occupied "
+                    f"with these models on it: {existing_model_uids}"
+                )
+
+        for idx in gpu_idx:
+            self._user_specified_gpu_to_model_uids[idx].add((model_uid, model_type))
+        return sorted(gpu_idx)
+
     def release_devices(self, model_uid: str):
         devices = [
             dev

@@ -320,27 +408,46 @@
             if model_uid in self._gpu_to_embedding_model_uids[dev]:
                 self._gpu_to_embedding_model_uids[dev].remove(model_uid)

+        # check user-specified slots
+        for dev in self._user_specified_gpu_to_model_uids:
+            model_infos = list(
+                filter(
+                    lambda x: x[0] == model_uid,
+                    self._user_specified_gpu_to_model_uids[dev],
+                )
+            )
+            for model_info in model_infos:
+                self._user_specified_gpu_to_model_uids[dev].remove(model_info)
+
     async def _create_subpool(
         self,
         model_uid: str,
         model_type: Optional[str] = None,
         n_gpu: Optional[Union[int, str]] = "auto",
+        gpu_idx: Optional[List[int]] = None,
     ) -> Tuple[str, List[str]]:
         env = {}
         devices = []
-        if
-
-
-
-
-
+        if gpu_idx is None:
+            if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
+                # Currently, n_gpu=auto means using 1 GPU
+                gpu_cnt = n_gpu if isinstance(n_gpu, int) else 1
+                devices = (
+                    [await self.allocate_devices_for_embedding(model_uid)]
+                    if model_type in ["embedding", "rerank"]
+                    else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
+                )
+                env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
+                logger.debug(f"GPU selected: {devices} for model {model_uid}")
+            if n_gpu is None:
+                env["CUDA_VISIBLE_DEVICES"] = "-1"
+                logger.debug(f"GPU disabled for model {model_uid}")
+        else:
+            assert isinstance(gpu_idx, list)
+            devices = await self.allocate_devices_with_gpu_idx(
+                model_uid, model_type, gpu_idx  # type: ignore
             )
             env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
-            logger.debug(f"GPU selected: {devices} for model {model_uid}")
-        if n_gpu is None:
-            env["CUDA_VISIBLE_DEVICES"] = "-1"
-            logger.debug(f"GPU disabled for model {model_uid}")

         if os.name != "nt" and platform.system() != "Darwin":
             # Linux

@@ -495,6 +602,7 @@
         image_lora_load_kwargs: Optional[Dict] = None,
         image_lora_fuse_kwargs: Optional[Dict] = None,
         request_limits: Optional[int] = None,
+        gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ):
         event_model_uid, _, __ = parse_replica_model_uid(model_uid)

@@ -510,6 +618,17 @@
         launch_args.pop("self")
         launch_args.pop("kwargs")
         launch_args.update(kwargs)
+
+        if gpu_idx is not None:
+            logger.info(
+                f"You specify to launch the model: {model_name} on GPU index: {gpu_idx} "
+                f"of the worker: {self.address}, "
+                f"xinference will automatically ignore the `n_gpu` option."
+            )
+            if isinstance(gpu_idx, int):
+                gpu_idx = [gpu_idx]
+            assert isinstance(gpu_idx, list)
+
         if n_gpu is not None:
             if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > gpu_count()):
                 raise ValueError(

@@ -535,7 +654,7 @@
         is_local_deployment = await self._supervisor_ref.is_local_deployment()

         subpool_address, devices = await self._create_subpool(
-            model_uid, model_type, n_gpu=n_gpu
+            model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
         )

         try:
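The worker hunks above introduce `_user_specified_gpu_to_model_uids` and `allocate_devices_with_gpu_idx`: a model pinned to explicit GPU indexes is tracked per `(replica_model_uid, model_type)`, and a GPU already claimed by a vLLM model is refused, since vLLM pre-allocates most of a device's memory. Below is a minimal, self-contained sketch of that bookkeeping; the class and attribute names are illustrative stand-ins, not xinference's actual `WorkerActor`.

```python
from collections import defaultdict
from typing import Dict, List, Set, Tuple


class GpuAllocatorSketch:
    """Illustrative stand-in for the per-GPU bookkeeping added to the worker."""

    def __init__(self, total_gpu_devices: List[int], vllm_model_uids: Set[str]):
        self.total_gpu_devices = total_gpu_devices
        # stand-in for the async is_model_vllm_backend() lookup in the real worker
        self.vllm_model_uids = vllm_model_uids
        # gpu_index -> {(replica_model_uid, model_type)}, mirroring the new dict above
        self.user_specified: Dict[int, Set[Tuple[str, str]]] = defaultdict(set)

    def allocate_with_gpu_idx(self, model_uid: str, model_type: str, gpu_idx: List[int]) -> List[int]:
        # user-specified indexes must be visible to this worker
        if not set(gpu_idx) <= set(self.total_gpu_devices):
            raise ValueError(f"GPUs {gpu_idx} not all visible; visible: {self.total_gpu_devices}")
        for idx in gpu_idx:
            # a GPU already running a vLLM model cannot be shared
            if any(uid in self.vllm_model_uids for uid, _ in self.user_specified[idx]):
                raise RuntimeError(f"GPU index {idx} is occupied by a vLLM model")
        for idx in gpu_idx:
            self.user_specified[idx].add((model_uid, model_type))
        return sorted(gpu_idx)


allocator = GpuAllocatorSketch(total_gpu_devices=[0, 1], vllm_model_uids={"qwen-vllm-0"})
print(allocator.allocate_with_gpu_idx("my-llm-1-0", "LLM", [1]))  # -> [1]
```

The real implementation additionally only warns, rather than failing, when non-vLLM models already occupy a user-specified GPU, as the `logger.warning` call in the hunk shows.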
xinference/deploy/cmdline.py
CHANGED

@@ -376,18 +376,27 @@ def worker(
     is_flag=True,
     help="Persist the model configuration to the filesystem, retains the model registration after server restarts.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def register_model(
     endpoint: Optional[str],
     model_type: str,
     file: str,
     persist: bool,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
     with open(file) as fd:
         model = fd.read()

-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.register_model(
         model_type=model_type,
         model=model,

@@ -408,15 +417,24 @@ def register_model(
     help="Type of model to unregister (default is 'LLM').",
 )
 @click.option("--model-name", "-n", type=str, help="Name of the model to unregister.")
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def unregister_model(
     endpoint: Optional[str],
     model_type: str,
     model_name: str,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)

-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.unregister_model(
         model_type=model_type,
         model_name=model_name,

@@ -437,15 +455,24 @@ def unregister_model(
     type=str,
     help="Filter by model type (default is 'LLM').",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def list_model_registrations(
     endpoint: Optional[str],
     model_type: str,
+    api_key: Optional[str],
 ):
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     registrations = client.list_model_registrations(model_type=model_type)

@@ -632,12 +659,31 @@ def list_model_registrations(
     type=(str, str),
     multiple=True,
 )
+@click.option(
+    "--worker-ip",
+    default=None,
+    type=str,
+    help="Specify which worker this model runs on by ip, for distributed situation.",
+)
+@click.option(
+    "--gpu-idx",
+    default=None,
+    type=str,
+    help="Specify which GPUs of a worker this model can run on, separated with commas.",
+)
 @click.option(
     "--trust-remote-code",
     default=True,
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 @click.pass_context
 def model_launch(
     ctx,

@@ -653,7 +699,10 @@ def model_launch(
     peft_model_path: Optional[str],
     image_lora_load_kwargs: Optional[Tuple],
     image_lora_fuse_kwargs: Optional[Tuple],
+    worker_ip: Optional[str],
+    gpu_idx: Optional[str],
     trust_remote_code: bool,
+    api_key: Optional[str],
 ):
     kwargs = {}
     for i in range(0, len(ctx.args), 2):

@@ -680,14 +729,19 @@
         else None
     )

+    _gpu_idx: Optional[List[int]] = (
+        None if gpu_idx is None else [int(idx) for idx in gpu_idx.split(",")]
+    )
+
     endpoint = get_endpoint(endpoint)
     model_size: Optional[Union[str, int]] = (
         size_in_billions
         if size_in_billions is None or "_" in size_in_billions
         else int(size_in_billions)
     )
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     model_uid = client.launch_model(
         model_name=model_name,

@@ -701,6 +755,8 @@
         peft_model_path=peft_model_path,
         image_lora_load_kwargs=image_lora_load_params,
         image_lora_fuse_kwargs=image_lora_fuse_params,
+        worker_ip=worker_ip,
+        gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
         **kwargs,
     )

@@ -718,12 +774,20 @@
     type=str,
     help="Xinference endpoint.",
 )
-
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def model_list(endpoint: Optional[str], api_key: Optional[str]):
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     llm_table = []
     embedding_table = []

@@ -844,13 +908,22 @@ def model_list(endpoint: Optional[str]):
     required=True,
     help="The unique identifier (UID) of the model.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_terminate(
     endpoint: Optional[str],
     model_uid: str,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.terminate_model(model_uid=model_uid)


@@ -873,15 +946,24 @@ def model_terminate(
     type=bool,
     help="Whether to stream the generated text. Use 'True' for streaming (default is True).",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_generate(
     endpoint: Optional[str],
     model_uid: str,
     max_tokens: int,
     stream: bool,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to

@@ -959,16 +1041,25 @@ def model_generate(
     type=bool,
     help="Whether to stream the chat messages. Use 'True' for streaming (default is True).",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_chat(
     endpoint: Optional[str],
     model_uid: str,
     max_tokens: int,
     stream: bool,
+    api_key: Optional[str],
 ):
     # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     chat_history: "List[ChatCompletionMessage]" = []
     if stream:

@@ -1048,10 +1139,18 @@ def model_chat(

 @cli.command("vllm-models", help="Query and display models compatible with vLLM.")
 @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
-
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def vllm_models(endpoint: Optional[str], api_key: Optional[str]):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     vllm_models_dict = client.vllm_models()
     print("VLLM supported model families:")
     chat_models = vllm_models_dict["chat"]
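Every CLI command above gains the same `--api-key`/`-ak` option and the same client-construction pattern: pass the key to `RESTfulClient`, and fall back to a locally stored token when no key is given. Below is a runnable sketch of that pattern with the xinference helpers stubbed out; `RESTfulClient`, `get_endpoint`, and `get_stored_token` are the names used in the hunks above, but the stub bodies here are assumptions for illustration only.

```python
from typing import Optional


class RESTfulClient:  # stand-in for xinference.client.restful.restful_client.RESTfulClient
    def __init__(self, base_url: str, api_key: Optional[str] = None):
        self.base_url, self.api_key = base_url, api_key

    def _set_token(self, token: Optional[str]) -> None:
        self.api_key = token


def get_endpoint(endpoint: Optional[str]) -> str:
    return endpoint or "http://127.0.0.1:9997"  # assumed default endpoint


def get_stored_token(endpoint: str, client: RESTfulClient) -> Optional[str]:
    return None  # stand-in: the real helper looks up a previously cached token


def build_client(endpoint: Optional[str], api_key: Optional[str]) -> RESTfulClient:
    endpoint = get_endpoint(endpoint)
    client = RESTfulClient(base_url=endpoint, api_key=api_key)
    if api_key is None:  # no --api-key given: try any stored token instead
        client._set_token(get_stored_token(endpoint, client))
    return client


print(build_client(None, api_key="sk-example").base_url)
```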

xinference/model/embedding/core.py
CHANGED

@@ -136,7 +136,7 @@ class EmbeddingModel:
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
         from sentence_transformers import SentenceTransformer

-
+        kwargs.setdefault("normalize_embeddings", True)

         # copied from sentence-transformers, and modify it to return tokens num
         @no_type_check

@@ -272,7 +272,6 @@
             self._model,
             sentences,
             convert_to_numpy=False,
-            normalize_embeddings=normalize_embeddings,
             **kwargs,
         )
         if isinstance(sentences, str):
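The embedding change replaces an explicit `normalize_embeddings=` argument with a `kwargs.setdefault(...)` at the top of `create_embedding`, so normalization is on by default while an explicit caller value still wins. A tiny illustration of the `setdefault` behavior (the function here is a toy, not xinference's `EmbeddingModel`):

```python
def create_embedding_sketch(sentences, **kwargs):
    # default normalization on, unless the caller passed normalize_embeddings explicitly
    kwargs.setdefault("normalize_embeddings", True)
    return kwargs["normalize_embeddings"]


print(create_embedding_sketch("hello"))                              # True
print(create_embedding_sketch("hello", normalize_embeddings=False))  # False
```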
xinference/model/llm/__init__.py
CHANGED

@@ -49,14 +49,15 @@ from .llm_family import (

 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
-    from .ggml.ctransformers import CtransformersModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.core import PytorchChatModel, PytorchModel
+    from .pytorch.deepseek_vl import DeepSeekVLChatModel
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
     from .pytorch.internlm2 import Internlm2PytorchChatModel
     from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .pytorch.omnilmm import OmniLMMModel
     from .pytorch.qwen_vl import QwenVLChatModel
     from .pytorch.vicuna import VicunaPytorchChatModel
     from .pytorch.yi_vl import YiVLChatModel

@@ -75,11 +76,6 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLM_CLASSES.extend(
-        [
-            CtransformersModel,
-        ]
-    )
     LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
     LLM_CLASSES.extend(

@@ -94,7 +90,9 @@ def _install():
             FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
+            OmniLMMModel,
             YiVLChatModel,
+            DeepSeekVLChatModel,
             PytorchModel,
         ]
     )
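`_install()` keeps `LLM_CLASSES` as a flat registry: dropping `CtransformersModel` and appending `OmniLMMModel` and `DeepSeekVLChatModel` changes which backends are even considered for a model. A simplified sketch of registry-style selection, assuming each registered class exposes a `match(llm_family, llm_spec, quantization)` classmethod like the ones visible in the llamacpp.py hunks below; this is illustrative, not xinference's actual dispatch code.

```python
from typing import List, Type

LLM_CLASSES: List[Type] = []


class DummyGgufBackend:
    @classmethod
    def match(cls, llm_family: dict, llm_spec: dict, quantization: str) -> bool:
        # hypothetical backend that only handles ggml/gguf weights
        return llm_spec["model_format"] in ("ggmlv3", "ggufv2")


LLM_CLASSES.append(DummyGgufBackend)


def select_backend(llm_family: dict, llm_spec: dict, quantization: str) -> Type:
    for klass in LLM_CLASSES:  # first registered class that matches wins
        if klass.match(llm_family, llm_spec, quantization):
            return klass
    raise ValueError("no registered backend matches this model spec")


print(select_backend({}, {"model_format": "ggufv2"}, "Q4_K_M").__name__)
```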

xinference/model/llm/ggml/llamacpp.py
CHANGED

@@ -30,7 +30,6 @@ from ....types import (
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import ChatModelMixin
-from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL

 logger = logging.getLogger(__name__)

@@ -182,11 +181,7 @@ class LlamaCppModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or "qwen" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name or "qwen" in llm_family.model_name:
             return False
         if "generate" not in llm_family.model_ability:
             return False

@@ -250,10 +245,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name:
             return False
         if "chat" not in llm_family.model_ability:
             return False
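With the ctransformers backend gone, the llama.cpp chat backend's gating reduces to the checks visible above: a ggml/gguf format, not a chatglm family, and the `chat` ability. A runnable sketch of that gating with the family and spec objects stubbed as dataclasses; the stubs are assumptions, while the conditions mirror the hunk above.

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class FamilyStub:  # stand-in for LLMFamilyV1
    model_name: str
    model_ability: List[str] = field(default_factory=lambda: ["generate", "chat"])


@dataclass
class SpecStub:  # stand-in for LLMSpecV1
    model_format: str


def chat_match(llm_family: FamilyStub, llm_spec: SpecStub) -> bool:
    if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
        return False
    if "chatglm" in llm_family.model_name:  # chatglm ggml has its own backend
        return False
    if "chat" not in llm_family.model_ability:
        return False
    return True


print(chat_match(FamilyStub("llama-2-chat"), SpecStub("ggufv2")))  # True
print(chat_match(FamilyStub("chatglm3"), SpecStub("ggufv2")))      # False
```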