xinference 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +47 -18
- xinference/api/oauth2/types.py +1 -0
- xinference/api/restful_api.py +9 -1
- xinference/client/restful/restful_client.py +12 -2
- xinference/conftest.py +13 -2
- xinference/core/supervisor.py +32 -1
- xinference/core/worker.py +139 -20
- xinference/deploy/cmdline.py +119 -20
- xinference/model/llm/__init__.py +4 -0
- xinference/model/llm/llm_family.json +627 -0
- xinference/model/llm/llm_family_modelscope.json +471 -0
- xinference/model/llm/pytorch/core.py +2 -0
- xinference/model/llm/pytorch/deepseek_vl.py +232 -0
- xinference/model/llm/pytorch/omnilmm.py +153 -0
- xinference/model/llm/utils.py +11 -1
- xinference/model/llm/vllm/core.py +3 -0
- xinference/thirdparty/deepseek_vl/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl/models/__init__.py +28 -0
- xinference/thirdparty/deepseek_vl/models/clip_encoder.py +242 -0
- xinference/thirdparty/deepseek_vl/models/image_processing_vlm.py +208 -0
- xinference/thirdparty/deepseek_vl/models/modeling_vlm.py +170 -0
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +390 -0
- xinference/thirdparty/deepseek_vl/models/projector.py +100 -0
- xinference/thirdparty/deepseek_vl/models/sam.py +593 -0
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +681 -0
- xinference/thirdparty/deepseek_vl/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl/utils/conversation.py +348 -0
- xinference/thirdparty/deepseek_vl/utils/io.py +78 -0
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +216 -0
- xinference/thirdparty/omnilmm/constants.py +4 -0
- xinference/thirdparty/omnilmm/conversation.py +332 -0
- xinference/thirdparty/omnilmm/model/__init__.py +1 -0
- xinference/thirdparty/omnilmm/model/omnilmm.py +594 -0
- xinference/thirdparty/omnilmm/model/resampler.py +166 -0
- xinference/thirdparty/omnilmm/model/utils.py +563 -0
- xinference/thirdparty/omnilmm/train/__init__.py +13 -0
- xinference/thirdparty/omnilmm/train/train_utils.py +150 -0
- xinference/thirdparty/omnilmm/utils.py +134 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.98516614.js +3 -0
- xinference/web/ui/build/static/js/main.98516614.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +1 -0
- {xinference-0.9.4.dist-info → xinference-0.10.0.dist-info}/METADATA +18 -5
- {xinference-0.9.4.dist-info → xinference-0.10.0.dist-info}/RECORD +55 -28
- xinference/web/ui/build/static/js/main.66b1c4fb.js +0 -3
- xinference/web/ui/build/static/js/main.66b1c4fb.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2124cfe036b26befcbd386d1d17743b1a58d0b7a041a17bb67f9924400d63c3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +0 -1
- /xinference/web/ui/build/static/js/{main.66b1c4fb.js.LICENSE.txt → main.98516614.js.LICENSE.txt} +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.0.dist-info}/LICENSE +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.0.dist-info}/WHEEL +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.0.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-03-
+ "date": "2024-03-29T12:46:14+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.
+ "full-revisionid": "2857ec497afbd2a6895d3658384ff3b4022b2840",
+ "version": "0.10.0"
 }
 ''' # END VERSION_JSON

xinference/api/oauth2/auth_service.py
CHANGED
@@ -11,8 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from datetime import timedelta
-from typing import List, Optional
+from typing import List, Optional, Tuple

 from fastapi import Depends, HTTPException, status
 from fastapi.security import OAuth2PasswordBearer, SecurityScopes
@@ -40,13 +41,30 @@ class AuthService:
     def config(self):
         return self._config

+    @staticmethod
+    def is_legal_api_key(key: str) -> bool:
+        pattern = re.compile("^sk-[a-zA-Z0-9]{13}$")
+        return re.match(pattern, key) is not None
+
     def init_auth_config(self):
         if self._auth_config_file:
             config: AuthStartupConfig = parse_file_as(
                 path=self._auth_config_file, type_=AuthStartupConfig
             )
+            all_api_keys = set()
             for user in config.user_config:
                 user.password = get_password_hash(user.password)
+                for api_key in user.api_keys:
+                    if not self.is_legal_api_key(api_key):
+                        raise ValueError(
+                            "Api-Key should be a string started with 'sk-' with a total length of 16"
+                        )
+                    if api_key in all_api_keys:
+                        raise ValueError(
+                            "Duplicate api-keys exists, please check your configuration"
+                        )
+                    else:
+                        all_api_keys.add(api_key)
             return config

     def __call__(
@@ -67,28 +85,30 @@ class AuthService:
                 headers={"WWW-Authenticate": authenticate_value},
             )

-
-
-
-
-        self._config
-
-
-
-
-
+        if self.is_legal_api_key(token):
+            user, token_scopes = self.get_user_and_scopes_with_api_key(token)
+        else:
+            try:
+                assert self._config is not None
+                payload = jwt.decode(
+                    token,
+                    self._config.auth_config.secret_key,
+                    algorithms=[self._config.auth_config.algorithm],
+                    options={"verify_exp": False},  # TODO: supports token expiration
+                )
+                username: str = payload.get("sub")
+                if username is None:
+                    raise credentials_exception
+                token_scopes = payload.get("scopes", [])
+                user = self.get_user(username)
+            except (JWTError, ValidationError):
                 raise credentials_exception
-        token_scopes = payload.get("scopes", [])
-        token_data = TokenData(scopes=token_scopes, username=username)
-        except (JWTError, ValidationError):
-            raise credentials_exception
-        user = self.get_user(token_data.username)
         if user is None:
             raise credentials_exception
-        if "admin" in
+        if "admin" in token_scopes:
             return user
         for scope in security_scopes.scopes:
-            if scope not in
+            if scope not in token_scopes:
                 raise HTTPException(
                     status_code=status.HTTP_403_FORBIDDEN,
                     detail="Not enough permissions",
@@ -102,6 +122,15 @@ class AuthService:
             return user
         return None

+    def get_user_and_scopes_with_api_key(
+        self, api_key: str
+    ) -> Tuple[Optional[User], List]:
+        for user in self._config.user_config:
+            for key in user.api_keys:
+                if api_key == key:
+                    return user, user.permissions
+        return None, []
+
     def authenticate_user(self, username: str, password: str):
         user = self.get_user(username)
         if not user:
xinference/api/oauth2/types.py
CHANGED
xinference/api/restful_api.py
CHANGED
@@ -89,7 +89,9 @@ class CreateCompletionRequest(CreateCompletion):

 class CreateEmbeddingRequest(BaseModel):
     model: str
-    input: Union[str, List[str]] = Field(
+    input: Union[str, List[str], List[int], List[List[int]]] = Field(
+        description="The input to embed."
+    )
     user: Optional[str] = None

     class Config:
@@ -693,6 +695,8 @@ class RESTfulAPI:
         peft_model_path = payload.get("peft_model_path", None)
         image_lora_load_kwargs = payload.get("image_lora_load_kwargs", None)
         image_lora_fuse_kwargs = payload.get("image_lora_fuse_kwargs", None)
+        worker_ip = payload.get("worker_ip", None)
+        gpu_idx = payload.get("gpu_idx", None)

         exclude_keys = {
             "model_uid",
@@ -707,6 +711,8 @@ class RESTfulAPI:
             "peft_model_path",
             "image_lora_load_kwargs",
             "image_lora_fuse_kwargs",
+            "worker_ip",
+            "gpu_idx",
         }

         kwargs = {
@@ -734,6 +740,8 @@ class RESTfulAPI:
            peft_model_path=peft_model_path,
            image_lora_load_kwargs=image_lora_load_kwargs,
            image_lora_fuse_kwargs=image_lora_fuse_kwargs,
+           worker_ip=worker_ip,
+           gpu_idx=gpu_idx,
            **kwargs,
        )
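With the CreateEmbeddingRequest change, the embeddings endpoint also accepts pre-tokenized input (a list of token ids, or a batch of such lists) in addition to strings. A hedged request sketch against the OpenAI-compatible /v1/embeddings route, with placeholder host, model uid, and token ids:

import requests

resp = requests.post(
    "http://127.0.0.1:9997/v1/embeddings",  # placeholder host/port
    json={
        "model": "my-embedding-model",      # placeholder model uid
        "input": [[101, 7592, 102]],        # List[List[int]]: one tokenized input
    },
)
resp.raise_for_status()
print(resp.json())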
xinference/client/restful/restful_client.py
CHANGED
@@ -651,11 +651,13 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):


 class Client:
-    def __init__(self, base_url):
+    def __init__(self, base_url, api_key: Optional[str] = None):
         self.base_url = base_url
-        self._headers = {}
+        self._headers: Dict[str, str] = {}
         self._cluster_authed = False
         self._check_cluster_authenticated()
+        if api_key is not None and self._cluster_authed:
+            self._headers["Authorization"] = f"Bearer {api_key}"

     def _set_token(self, token: Optional[str]):
         if not self._cluster_authed or token is None:
@@ -795,6 +797,8 @@ class Client:
         peft_model_path: Optional[str] = None,
         image_lora_load_kwargs: Optional[Dict] = None,
         image_lora_fuse_kwargs: Optional[Dict] = None,
+        worker_ip: Optional[str] = None,
+        gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ) -> str:
         """
@@ -828,6 +832,10 @@ class Client:
             lora load parameters for image model
         image_lora_fuse_kwargs: Optional[Dict]
             lora fuse parameters for image model
+        worker_ip: Optional[str]
+            Specify the worker ip where the model is located in a distributed scenario.
+        gpu_idx: Optional[Union[int, List[int]]]
+            Specify the GPU index where the model is located.
         **kwargs:
             Any other parameters been specified.

@@ -853,6 +861,8 @@ class Client:
             "peft_model_path": peft_model_path,
             "image_lora_load_kwargs": image_lora_load_kwargs,
             "image_lora_fuse_kwargs": image_lora_fuse_kwargs,
+            "worker_ip": worker_ip,
+            "gpu_idx": gpu_idx,
         }

         for key, value in kwargs.items():
xinference/conftest.py
CHANGED
@@ -261,12 +261,23 @@ def setup_with_auth():
     if not cluster_health_check(supervisor_addr, max_attempts=10, sleep_interval=3):
         raise RuntimeError("Cluster is not available after multiple attempts")

-    user1 = User(
-
+    user1 = User(
+        username="user1",
+        password="pass1",
+        permissions=["admin"],
+        api_keys=["sk-3sjLbdwqAhhAF", "sk-0HCRO1rauFQDL"],
+    )
+    user2 = User(
+        username="user2",
+        password="pass2",
+        permissions=["models:list"],
+        api_keys=["sk-72tkvudyGLPMi"],
+    )
     user3 = User(
         username="user3",
         password="pass3",
         permissions=["models:list", "models:read", "models:start"],
+        api_keys=["sk-m6jEzEwmCc4iQ", "sk-ZOTLIY4gt9w11"],
     )
     auth_config = AuthConfig(
         algorithm="HS256",
xinference/core/supervisor.py
CHANGED
@@ -92,6 +92,15 @@ class SupervisorActor(xo.StatelessActor):
     def uid(cls) -> str:
         return "supervisor"

+    def _get_worker_ref_by_ip(
+        self, ip: str
+    ) -> Optional[xo.ActorRefType["WorkerActor"]]:
+        for addr, ref in self._worker_address_to_worker.items():
+            existing_ip = addr.split(":")[0]
+            if existing_ip == ip:
+                return ref
+        return None
+
     async def __post_create__(self):
         self._uptime = time.time()
         if not XINFERENCE_DISABLE_HEALTH_CHECK:
@@ -717,8 +726,25 @@ class SupervisorActor(xo.StatelessActor):
         peft_model_path: Optional[str] = None,
         image_lora_load_kwargs: Optional[Dict] = None,
         image_lora_fuse_kwargs: Optional[Dict] = None,
+        worker_ip: Optional[str] = None,
+        gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ) -> str:
+        target_ip_worker_ref = (
+            self._get_worker_ref_by_ip(worker_ip) if worker_ip is not None else None
+        )
+        if (
+            worker_ip is not None
+            and not self.is_local_deployment()
+            and target_ip_worker_ref is None
+        ):
+            raise ValueError(f"Worker ip address {worker_ip} is not in the cluster.")
+        if worker_ip is not None and self.is_local_deployment():
+            logger.warning(
+                f"You specified the worker ip: {worker_ip} in local mode, "
+                f"xinference will ignore this option."
+            )
+
         if model_uid is None:
             model_uid = self._gen_model_uid(model_name)

@@ -735,7 +761,11 @@ class SupervisorActor(xo.StatelessActor):
             )

             nonlocal model_type
-            worker_ref =
+            worker_ref = (
+                target_ip_worker_ref
+                if target_ip_worker_ref is not None
+                else await self._choose_worker()
+            )
             # LLM as default for compatibility
             model_type = model_type or "LLM"
             await worker_ref.launch_builtin_model(
@@ -750,6 +780,7 @@ class SupervisorActor(xo.StatelessActor):
                 peft_model_path=peft_model_path,
                 image_lora_load_kwargs=image_lora_load_kwargs,
                 image_lora_fuse_kwargs=image_lora_fuse_kwargs,
+                gpu_idx=gpu_idx,
                 **kwargs,
             )
             self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
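The supervisor validates worker_ip before scheduling: in a distributed deployment an unknown address raises a ValueError, in local mode the option is ignored with a warning, and gpu_idx is forwarded to the chosen worker. A hedged sketch of the equivalent raw HTTP launch call, assuming the /v1/models launch route served by the REST handler above (host, key, and model name are placeholders):

import requests

resp = requests.post(
    "http://127.0.0.1:9997/v1/models",                     # assumed launch endpoint
    headers={"Authorization": "Bearer sk-3sjLbdwqAhhAF"},  # placeholder key
    json={
        "model_uid": None,
        "model_name": "llama-2-chat",   # placeholder model name
        "worker_ip": "192.168.0.12",    # must be a worker in the cluster
        "gpu_idx": [0, 1],              # GPU indexes on that worker
    },
)
resp.raise_for_status()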
xinference/core/worker.py
CHANGED
@@ -74,6 +74,10 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
+        # Dict structure: gpu_index: {(replica_model_uid, model_type)}
+        self._user_specified_gpu_to_model_uids: Dict[
+            int, Set[Tuple[str, str]]
+        ] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
         self._model_uid_to_recover_count: Dict[str, int] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}
@@ -268,12 +272,27 @@ class WorkerActor(xo.StatelessActor):
         """
         candidates = []
         for _dev in self._total_gpu_devices:
-            if
+            if (
+                _dev not in self._gpu_to_model_uid
+                and _dev not in self._user_specified_gpu_to_model_uids
+            ):  # no possible vllm model on it, add it to candidates
                 candidates.append(_dev)
-            else:
-
-
-
+            else:  # need to judge that whether to have vllm model on this device
+                has_vllm_model = False
+                if _dev in self._gpu_to_model_uid:
+                    existing_model_uid = self._gpu_to_model_uid[_dev]
+                    has_vllm_model = await self.is_model_vllm_backend(
+                        existing_model_uid
+                    )
+                if (
+                    not has_vllm_model
+                    and _dev in self._user_specified_gpu_to_model_uids
+                ):
+                    for rep_uid, _ in self._user_specified_gpu_to_model_uids[_dev]:
+                        has_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                        if has_vllm_model:
+                            break
+                if not has_vllm_model:
                     candidates.append(_dev)

         if len(candidates) == 0:
@@ -285,9 +304,13 @@ class WorkerActor(xo.StatelessActor):
         device, min_cnt = -1, -1
         # Pick the device with the fewest existing models among all the candidate devices.
         for _dev in candidates:
-            existing_cnt =
+            existing_cnt = 0
+            if _dev in self._gpu_to_embedding_model_uids:
+                existing_cnt += len(self._gpu_to_embedding_model_uids[_dev])
             if _dev in self._gpu_to_model_uid:
                 existing_cnt += 1
+            if _dev in self._user_specified_gpu_to_model_uids:
+                existing_cnt += len(self._user_specified_gpu_to_model_uids[_dev])
             if min_cnt == -1 or existing_cnt < min_cnt:
                 device, min_cnt = _dev, existing_cnt
@@ -295,17 +318,82 @@ class WorkerActor(xo.StatelessActor):
         return device

     def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]:
-
+        user_specified_allocated_devices: Set[int] = set()
+        for dev, model_infos in self._user_specified_gpu_to_model_uids.items():
+            allocated_non_embedding_rerank_models = False
+            for _, model_type in model_infos:
+                allocated_non_embedding_rerank_models = model_type not in [
+                    "embedding",
+                    "rerank",
+                ]
+                if allocated_non_embedding_rerank_models:
+                    break
+            if allocated_non_embedding_rerank_models:
+                user_specified_allocated_devices.add(dev)
+        allocated_devices = set(self._gpu_to_model_uid.keys()).union(
+            user_specified_allocated_devices
+        )
+        if n_gpu > len(self._total_gpu_devices) - len(allocated_devices):
             raise RuntimeError("No available slot found for the model")

         devices: List[int] = [
-            dev
+            dev
+            for dev in self._total_gpu_devices
+            if dev not in self._gpu_to_model_uid
+            and dev not in user_specified_allocated_devices
         ][:n_gpu]
         for dev in devices:
             self._gpu_to_model_uid[int(dev)] = model_uid

         return sorted(devices)

+    async def allocate_devices_with_gpu_idx(
+        self, model_uid: str, model_type: str, gpu_idx: List[int]
+    ) -> List[int]:
+        """
+        When user specifies the gpu_idx, allocate models on user-specified GPUs whenever possible
+        """
+        # must be subset of total devices visible to this worker
+        if not set(gpu_idx) <= set(self._total_gpu_devices):
+            raise ValueError(
+                f"Worker {self.address} cannot use the GPUs with these indexes: {gpu_idx}. "
+                f"Worker {self.address} can only see these GPUs: {self._total_gpu_devices}."
+            )
+        # currently just report a warning log when there are already models on these GPUs
+        for idx in gpu_idx:
+            existing_model_uids = []
+            if idx in self._gpu_to_model_uid:
+                rep_uid = self._gpu_to_model_uid[idx]
+                is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                if is_vllm_model:
+                    raise RuntimeError(
+                        f"GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                        f"therefore cannot allocate GPU memory for a new model."
+                    )
+                existing_model_uids.append(rep_uid)
+            if idx in self._gpu_to_embedding_model_uids:
+                existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
+            # If user has run the vLLM model on the GPU that was forced to be specified,
+            # it is not possible to force this GPU to be allocated again
+            if idx in self._user_specified_gpu_to_model_uids:
+                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
+                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                    if is_vllm_model:
+                        raise RuntimeError(
+                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                            f"therefore cannot allocate GPU memory for a new model."
+                        )

+            if existing_model_uids:
+                logger.warning(
+                    f"WARNING!!! GPU index {idx} has been occupied "
+                    f"with these models on it: {existing_model_uids}"
+                )
+
+        for idx in gpu_idx:
+            self._user_specified_gpu_to_model_uids[idx].add((model_uid, model_type))
+        return sorted(gpu_idx)
+
     def release_devices(self, model_uid: str):
         devices = [
             dev
@@ -320,27 +408,46 @@ class WorkerActor(xo.StatelessActor):
             if model_uid in self._gpu_to_embedding_model_uids[dev]:
                 self._gpu_to_embedding_model_uids[dev].remove(model_uid)

+        # check user-specified slots
+        for dev in self._user_specified_gpu_to_model_uids:
+            model_infos = list(
+                filter(
+                    lambda x: x[0] == model_uid,
+                    self._user_specified_gpu_to_model_uids[dev],
+                )
+            )
+            for model_info in model_infos:
+                self._user_specified_gpu_to_model_uids[dev].remove(model_info)
+
     async def _create_subpool(
         self,
         model_uid: str,
         model_type: Optional[str] = None,
         n_gpu: Optional[Union[int, str]] = "auto",
+        gpu_idx: Optional[List[int]] = None,
     ) -> Tuple[str, List[str]]:
         env = {}
         devices = []
-        if
-
-
-
-
-
+        if gpu_idx is None:
+            if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
+                # Currently, n_gpu=auto means using 1 GPU
+                gpu_cnt = n_gpu if isinstance(n_gpu, int) else 1
+                devices = (
+                    [await self.allocate_devices_for_embedding(model_uid)]
+                    if model_type in ["embedding", "rerank"]
+                    else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
+                )
+                env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
+                logger.debug(f"GPU selected: {devices} for model {model_uid}")
+            if n_gpu is None:
+                env["CUDA_VISIBLE_DEVICES"] = "-1"
+                logger.debug(f"GPU disabled for model {model_uid}")
+        else:
+            assert isinstance(gpu_idx, list)
+            devices = await self.allocate_devices_with_gpu_idx(
+                model_uid, model_type, gpu_idx  # type: ignore
             )
             env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
-            logger.debug(f"GPU selected: {devices} for model {model_uid}")
-        if n_gpu is None:
-            env["CUDA_VISIBLE_DEVICES"] = "-1"
-            logger.debug(f"GPU disabled for model {model_uid}")

         if os.name != "nt" and platform.system() != "Darwin":
             # Linux
@@ -495,6 +602,7 @@ class WorkerActor(xo.StatelessActor):
         image_lora_load_kwargs: Optional[Dict] = None,
         image_lora_fuse_kwargs: Optional[Dict] = None,
         request_limits: Optional[int] = None,
+        gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ):
         event_model_uid, _, __ = parse_replica_model_uid(model_uid)
@@ -510,6 +618,17 @@ class WorkerActor(xo.StatelessActor):
         launch_args.pop("self")
         launch_args.pop("kwargs")
         launch_args.update(kwargs)
+
+        if gpu_idx is not None:
+            logger.info(
+                f"You specify to launch the model: {model_name} on GPU index: {gpu_idx} "
+                f"of the worker: {self.address}, "
+                f"xinference will automatically ignore the `n_gpu` option."
+            )
+            if isinstance(gpu_idx, int):
+                gpu_idx = [gpu_idx]
+            assert isinstance(gpu_idx, list)
+
         if n_gpu is not None:
             if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > gpu_count()):
                 raise ValueError(
@@ -535,7 +654,7 @@ class WorkerActor(xo.StatelessActor):
         is_local_deployment = await self._supervisor_ref.is_local_deployment()

         subpool_address, devices = await self._create_subpool(
-            model_uid, model_type, n_gpu=n_gpu
+            model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
         )

         try:
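In sum, the worker now tracks user-pinned GPUs (self._user_specified_gpu_to_model_uids) alongside auto-allocated ones, and refuses to place a new model on any device already occupied by a vLLM model, since vLLM pre-allocates most of a device's memory. A simplified, self-contained sketch of that candidate-filtering rule (not the actor code itself; is_vllm_backend stands in for the async is_model_vllm_backend check):

from collections import defaultdict
from typing import Callable, Dict, List, Set, Tuple

def pick_candidates(
    total_gpus: List[int],
    gpu_to_model: Dict[int, str],
    user_specified: Dict[int, Set[Tuple[str, str]]],
    is_vllm_backend: Callable[[str], bool],
) -> List[int]:
    # A GPU stays a candidate unless a model already on it uses the vLLM backend.
    candidates = []
    for dev in total_gpus:
        uids = []
        if dev in gpu_to_model:
            uids.append(gpu_to_model[dev])
        uids.extend(uid for uid, _ in user_specified.get(dev, set()))
        if not any(is_vllm_backend(uid) for uid in uids):
            candidates.append(dev)
    return candidates

# GPU 0 hosts a vLLM model, GPU 1 a transformers model: only GPU 1 qualifies.
print(pick_candidates(
    [0, 1],
    {0: "vllm-model", 1: "pt-model"},
    defaultdict(set),
    lambda uid: uid.startswith("vllm"),
))  # -> [1]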