xinference 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (42)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +31 -0
  3. xinference/client/common.py +2 -0
  4. xinference/client/restful/restful_client.py +25 -0
  5. xinference/core/supervisor.py +11 -1
  6. xinference/core/worker.py +16 -0
  7. xinference/deploy/cmdline.py +53 -1
  8. xinference/device_utils.py +0 -2
  9. xinference/model/core.py +13 -2
  10. xinference/model/image/core.py +16 -2
  11. xinference/model/image/stable_diffusion/core.py +25 -2
  12. xinference/model/llm/__init__.py +17 -0
  13. xinference/model/llm/core.py +18 -2
  14. xinference/model/llm/ggml/llamacpp.py +3 -19
  15. xinference/model/llm/llm_family.json +8 -3
  16. xinference/model/llm/llm_family.py +100 -29
  17. xinference/model/llm/llm_family_modelscope.json +7 -2
  18. xinference/model/llm/pytorch/baichuan.py +2 -0
  19. xinference/model/llm/pytorch/chatglm.py +2 -0
  20. xinference/model/llm/pytorch/core.py +23 -0
  21. xinference/model/llm/pytorch/falcon.py +4 -0
  22. xinference/model/llm/pytorch/internlm2.py +2 -0
  23. xinference/model/llm/pytorch/llama_2.py +4 -0
  24. xinference/model/llm/pytorch/qwen_vl.py +1 -0
  25. xinference/model/llm/pytorch/vicuna.py +2 -0
  26. xinference/model/llm/pytorch/yi_vl.py +1 -0
  27. xinference/web/ui/build/asset-manifest.json +3 -3
  28. xinference/web/ui/build/index.html +1 -1
  29. xinference/web/ui/build/static/js/{main.ebf7716d.js → main.78829790.js} +3 -3
  30. xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
  33. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/METADATA +3 -1
  34. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/RECORD +39 -39
  35. xinference/web/ui/build/static/js/main.ebf7716d.js.map +0 -1
  36. xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
  37. xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
  38. /xinference/web/ui/build/static/js/{main.ebf7716d.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
  39. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
  40. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
  41. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
  42. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

  version_json = '''
  {
- "date": "2024-03-01T14:36:49+0800",
+ "date": "2024-03-08T13:28:03+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "7b20f76ff35c3ca1824656fcd792837d909b0351",
- "version": "0.9.1"
+ "full-revisionid": "29f4c10a854cfec684dcf8398a0974f64bf8ce2b",
+ "version": "0.9.2"
  }
  ''' # END VERSION_JSON

xinference/api/restful_api.py CHANGED
@@ -219,6 +219,11 @@ class RESTfulAPI:
  self._router.add_api_route(
  "/v1/models/families", self._get_builtin_families, methods=["GET"]
  )
+ self._router.add_api_route(
+ "/v1/models/vllm-supported",
+ self.list_vllm_supported_model_families,
+ methods=["GET"],
+ )
  self._router.add_api_route(
  "/v1/cluster/info", self.get_cluster_device_info, methods=["GET"]
  )
@@ -651,6 +656,9 @@ class RESTfulAPI:
  replica = payload.get("replica", 1)
  n_gpu = payload.get("n_gpu", "auto")
  request_limits = payload.get("request_limits", None)
+ peft_model_path = payload.get("peft_model_path", None)
+ image_lora_load_kwargs = payload.get("image_lora_load_kwargs", None)
+ image_lora_fuse_kwargs = payload.get("image_lora_fuse_kwargs", None)

  exclude_keys = {
  "model_uid",
@@ -662,6 +670,9 @@ class RESTfulAPI:
  "replica",
  "n_gpu",
  "request_limits",
+ "peft_model_path",
+ "image_lora_load_kwargs",
+ "image_lora_fuse_kwargs",
  }

  kwargs = {
@@ -686,6 +697,9 @@ class RESTfulAPI:
  n_gpu=n_gpu,
  request_limits=request_limits,
  wait_ready=wait_ready,
+ peft_model_path=peft_model_path,
+ image_lora_load_kwargs=image_lora_load_kwargs,
+ image_lora_fuse_kwargs=image_lora_fuse_kwargs,
  **kwargs,
  )

@@ -1258,6 +1272,7 @@ class RESTfulAPI:
  self.handle_request_limit_error(re)
  async for item in iterator:
  yield item
+ yield "[DONE]"
  except Exception as ex:
  logger.exception("Chat completion stream got an error: %s", ex)
  await self._report_error_event(model_uid, str(ex))
@@ -1350,6 +1365,22 @@ class RESTfulAPI:
  logger.error(e, exc_info=True)
  raise HTTPException(status_code=500, detail=str(e))

+ async def list_vllm_supported_model_families(self) -> JSONResponse:
+ try:
+ from ..model.llm.vllm.core import (
+ VLLM_SUPPORTED_CHAT_MODELS,
+ VLLM_SUPPORTED_MODELS,
+ )
+
+ data = {
+ "chat": VLLM_SUPPORTED_CHAT_MODELS,
+ "generate": VLLM_SUPPORTED_MODELS,
+ }
+ return JSONResponse(content=data)
+ except Exception as e:
+ logger.error(e, exc_info=True)
+ raise HTTPException(status_code=500, detail=str(e))
+
  async def get_cluster_device_info(
  self, detailed: bool = Query(False)
  ) -> JSONResponse:
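
The hunks above add a GET /v1/models/vllm-supported endpoint and append a "[DONE]" sentinel to chat-completion streams. A minimal sketch of querying the new endpoint; the URL path and the "chat"/"generate" keys come from the diff, while the host and port are assumptions:

import requests

base_url = "http://127.0.0.1:9997"  # assumed local xinference endpoint, not from the diff
resp = requests.get(f"{base_url}/v1/models/vllm-supported")
resp.raise_for_status()
families = resp.json()
print(families["chat"])      # model families vLLM can serve as chat models
print(families["generate"])  # model families vLLM can serve as generate models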
xinference/client/common.py CHANGED
@@ -43,6 +43,8 @@ def streaming_response_iterator(
  line = line.strip()
  if line.startswith(b"data:"):
  json_str = line[len(b"data:") :].strip()
+ if json_str == b"[DONE]":
+ continue
  data = json.loads(json_str.decode("utf-8"))
  error = data.get("error")
  if error is not None:
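
The client-side iterator now skips the "[DONE]" sentinel instead of trying to parse it as JSON. A self-contained sketch of that handling, with made-up example lines:

import json

lines = [b'data: {"choices": [{"text": "hi"}]}', b"data: [DONE]"]
for line in lines:
    line = line.strip()
    if line.startswith(b"data:"):
        json_str = line[len(b"data:"):].strip()
        if json_str == b"[DONE]":
            continue  # sentinel marks the end of the stream; nothing to decode
        print(json.loads(json_str.decode("utf-8")))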
xinference/client/restful/restful_client.py CHANGED
@@ -683,6 +683,19 @@ class Client:
  response_data = response.json()
  self._cluster_authed = bool(response_data["auth"])

+ def vllm_models(self) -> Dict[str, Any]:
+ url = f"{self.base_url}/v1/models/vllm-supported"
+ response = requests.get(url, headers=self._headers)
+ if response.status_code != 200:
+ raise RuntimeError(
+ f"Failed to fetch VLLM models. detail: {response.json()['detail']}"
+ )
+
+ try:
+ return response.json()
+ except Exception as e:
+ raise RuntimeError(f"Error parsing JSON response: {e}")
+
  def login(self, username: str, password: str):
  if not self._cluster_authed:
  return
@@ -778,6 +791,9 @@ class Client:
  replica: int = 1,
  n_gpu: Optional[Union[int, str]] = "auto",
  request_limits: Optional[int] = None,
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  **kwargs,
  ) -> str:
  """
@@ -805,6 +821,12 @@ class Client:
  request_limits: Optional[int]
  The number of request limits for this model, default is None.
  ``request_limits=None`` means no limits for this model.
+ peft_model_path: Optional[str]
+ PEFT (Parameter-Efficient Fine-Tuning) model path.
+ image_lora_load_kwargs: Optional[Dict]
+ lora load parameters for image model
+ image_lora_fuse_kwargs: Optional[Dict]
+ lora fuse parameters for image model
  **kwargs:
  Any other parameters been specified.

@@ -827,6 +849,9 @@ class Client:
  "replica": replica,
  "n_gpu": n_gpu,
  "request_limits": request_limits,
+ "peft_model_path": peft_model_path,
+ "image_lora_load_kwargs": image_lora_load_kwargs,
+ "image_lora_fuse_kwargs": image_lora_fuse_kwargs,
  }

  for key, value in kwargs.items():
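
These hunks surface the new options on the Python client: launch_model() now accepts peft_model_path plus image LoRA load/fuse kwargs, and vllm_models() wraps the new endpoint. A hedged usage sketch; the endpoint, model name, and adapter path are placeholders, not values from the diff:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
print(client.vllm_models())  # {"chat": [...], "generate": [...]}

model_uid = client.launch_model(
    model_name="llama-2-chat",        # assumed built-in model name
    model_format="pytorch",
    peft_model_path="/path/to/lora",  # hypothetical PEFT adapter directory
)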
xinference/core/supervisor.py CHANGED
@@ -714,6 +714,9 @@ class SupervisorActor(xo.StatelessActor):
  request_limits: Optional[int] = None,
  wait_ready: bool = True,
  model_version: Optional[str] = None,
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  **kwargs,
  ) -> str:
  if model_uid is None:
@@ -751,6 +754,9 @@ class SupervisorActor(xo.StatelessActor):
  model_type=model_type,
  n_gpu=n_gpu,
  request_limits=request_limits,
+ peft_model_path=peft_model_path,
+ image_lora_load_kwargs=image_lora_load_kwargs,
+ image_lora_fuse_kwargs=image_lora_fuse_kwargs,
  **kwargs,
  )
  self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
@@ -922,7 +928,11 @@ class SupervisorActor(xo.StatelessActor):
  workers = list(self._worker_address_to_worker.values())
  for worker in workers:
  ret.update(await worker.list_models())
- return {parse_replica_model_uid(k)[0]: v for k, v in ret.items()}
+ running_model_info = {parse_replica_model_uid(k)[0]: v for k, v in ret.items()}
+ # add replica count
+ for k, v in running_model_info.items():
+ v["replica"] = self._model_uid_to_replica_info[k].replica
+ return running_model_info

  def is_local_deployment(self) -> bool:
  # TODO: temporary.
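
The supervisor's list_models() now attaches each running model's replica count to the returned info. An illustrative sketch of reading it through the REST client, assuming the field is passed through unchanged; the endpoint and values are made up:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
running = client.list_models()
# e.g. {"my-model": {"model_name": "llama-2-chat", "replica": 2, ...}}
for uid, info in running.items():
    print(uid, info["replica"])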
xinference/core/worker.py CHANGED
@@ -491,6 +491,9 @@ class WorkerActor(xo.StatelessActor):
  quantization: Optional[str],
  model_type: str = "LLM",
  n_gpu: Optional[Union[int, str]] = "auto",
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  request_limits: Optional[int] = None,
  **kwargs,
  ):
@@ -516,6 +519,16 @@ class WorkerActor(xo.StatelessActor):
  if isinstance(n_gpu, str) and n_gpu != "auto":
  raise ValueError("Currently `n_gpu` only supports `auto`.")

+ if peft_model_path is not None:
+ if model_type in ("embedding", "rerank"):
+ raise ValueError(
+ f"PEFT adaptors cannot be applied to embedding or rerank models."
+ )
+ if model_type == "LLM" and model_format in ("ggufv2", "ggmlv3"):
+ raise ValueError(
+ f"PEFT adaptors can only be applied to pytorch-like models"
+ )
+
  assert model_uid not in self._model_uid_to_model
  self._check_model_is_valid(model_name, model_format)
  assert self._supervisor_ref is not None
@@ -537,6 +550,9 @@ class WorkerActor(xo.StatelessActor):
  model_format,
  model_size_in_billions,
  quantization,
+ peft_model_path,
+ image_lora_load_kwargs,
+ image_lora_fuse_kwargs,
  is_local_deployment,
  **kwargs,
  )
xinference/deploy/cmdline.py CHANGED
@@ -17,7 +17,7 @@ import logging
  import os
  import sys
  import warnings
- from typing import List, Optional, Union
+ from typing import List, Optional, Tuple, Union

  import click
  from xoscar.utils import get_next_port
@@ -596,6 +596,26 @@ def list_model_registrations(
  type=str,
  help='The number of GPUs used by the model, default is "auto".',
  )
+ @click.option(
+ "--peft-model-path",
+ default=None,
+ type=str,
+ help="PEFT model path.",
+ )
+ @click.option(
+ "--image-lora-load-kwargs",
+ "-ld",
+ "image_lora_load_kwargs",
+ type=(str, str),
+ multiple=True,
+ )
+ @click.option(
+ "--image-lora-fuse-kwargs",
+ "-fd",
+ "image_lora_fuse_kwargs",
+ type=(str, str),
+ multiple=True,
+ )
  @click.option(
  "--trust-remote-code",
  default=True,
@@ -614,6 +634,9 @@ def model_launch(
  quantization: str,
  replica: int,
  n_gpu: str,
+ peft_model_path: Optional[str],
+ image_lora_load_kwargs: Optional[Tuple],
+ image_lora_fuse_kwargs: Optional[Tuple],
  trust_remote_code: bool,
  ):
  kwargs = {}
@@ -630,6 +653,17 @@ def model_launch(
  else:
  _n_gpu = int(n_gpu)

+ image_lora_load_params = (
+ {k: handle_click_args_type(v) for k, v in dict(image_lora_load_kwargs).items()}
+ if image_lora_load_kwargs
+ else None
+ )
+ image_lora_fuse_params = (
+ {k: handle_click_args_type(v) for k, v in dict(image_lora_fuse_kwargs).items()}
+ if image_lora_fuse_kwargs
+ else None
+ )
+
  endpoint = get_endpoint(endpoint)
  model_size: Optional[Union[str, int]] = (
  size_in_billions
@@ -648,6 +682,9 @@ def model_launch(
  quantization=quantization,
  replica=replica,
  n_gpu=_n_gpu,
+ peft_model_path=peft_model_path,
+ image_lora_load_kwargs=image_lora_load_params,
+ image_lora_fuse_kwargs=image_lora_fuse_params,
  trust_remote_code=trust_remote_code,
  **kwargs,
  )
@@ -944,6 +981,21 @@ def model_chat(
  )


+ @cli.command("vllm-models", help="Query and display models compatible with VLLM.")
+ @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+ def vllm_models(endpoint: Optional[str]):
+ endpoint = get_endpoint(endpoint)
+ client = RESTfulClient(base_url=endpoint)
+ client._set_token(get_stored_token(endpoint, client))
+ vllm_models_dict = client.vllm_models()
+ print("VLLM supported model families:")
+ chat_models = vllm_models_dict["chat"]
+ supported_models = vllm_models_dict["generate"]
+
+ print("VLLM supported chat model families:", chat_models)
+ print("VLLM supported generate model families:", supported_models)
+
+
  @cli.command("login", help="Login when the cluster is authenticated.")
  @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
  @click.option("--username", type=str, required=True, help="Username.")
xinference/device_utils.py CHANGED
@@ -92,8 +92,6 @@ def gpu_count():
  )

  return min(torch.cuda.device_count(), len(cuda_visible_devices))
- elif torch.backends.mps.is_available():
- return 1
  elif is_xpu_available():
  return torch.xpu.device_count()
  else:
xinference/model/core.py CHANGED
@@ -13,7 +13,7 @@
  # limitations under the License.

  from abc import ABC, abstractmethod
- from typing import Any, List, Optional, Tuple
+ from typing import Any, Dict, List, Optional, Tuple

  from .._compat import BaseModel

@@ -52,6 +52,9 @@ def create_model_instance(
  model_format: Optional[str] = None,
  model_size_in_billions: Optional[int] = None,
  quantization: Optional[str] = None,
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  is_local_deployment: bool = False,
  **kwargs,
  ) -> Tuple[Any, ModelDescription]:
@@ -70,6 +73,7 @@ def create_model_instance(
  model_format,
  model_size_in_billions,
  quantization,
+ peft_model_path,
  is_local_deployment,
  **kwargs,
  )
@@ -82,7 +86,14 @@
  elif model_type == "image":
  kwargs.pop("trust_remote_code", None)
  return create_image_model_instance(
- subpool_addr, devices, model_uid, model_name, **kwargs
+ subpool_addr,
+ devices,
+ model_uid,
+ model_name,
+ lora_model_path=peft_model_path,
+ lora_load_kwargs=image_lora_load_kwargs,
+ lora_fuse_kwargs=image_lora_fuse_kwargs,
+ **kwargs,
  )
  elif model_type == "rerank":
  kwargs.pop("trust_remote_code", None)
xinference/model/image/core.py CHANGED
@@ -155,7 +155,14 @@ def get_cache_status(


  def create_image_model_instance(
- subpool_addr: str, devices: List[str], model_uid: str, model_name: str, **kwargs
+ subpool_addr: str,
+ devices: List[str],
+ model_uid: str,
+ model_name: str,
+ lora_model_path: Optional[str] = None,
+ lora_load_kwargs: Optional[Dict] = None,
+ lora_fuse_kwargs: Optional[Dict] = None,
+ **kwargs,
  ) -> Tuple[DiffusionModel, ImageModelDescription]:
  model_spec = match_diffusion(model_name)
  controlnet = kwargs.get("controlnet")
@@ -187,7 +194,14 @@
  else:
  kwargs["controlnet"] = controlnet_model_paths
  model_path = cache(model_spec)
- model = DiffusionModel(model_uid, model_path, **kwargs)
+ model = DiffusionModel(
+ model_uid,
+ model_path,
+ lora_model_path=lora_model_path,
+ lora_load_kwargs=lora_load_kwargs,
+ lora_fuse_kwargs=lora_fuse_kwargs,
+ **kwargs,
+ )
  model_description = ImageModelDescription(
  subpool_addr, devices, model_spec, model_path=model_path
  )
xinference/model/image/stable_diffusion/core.py CHANGED
@@ -21,7 +21,7 @@ import uuid
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from io import BytesIO
- from typing import List, Optional, Union
+ from typing import Dict, List, Optional, Union

  from ....constants import XINFERENCE_IMAGE_DIR
  from ....device_utils import move_model_to_available_device
@@ -32,14 +32,36 @@ logger = logging.getLogger(__name__)

  class DiffusionModel:
  def __init__(
- self, model_uid: str, model_path: str, device: Optional[str] = None, **kwargs
+ self,
+ model_uid: str,
+ model_path: str,
+ device: Optional[str] = None,
+ lora_model_path: Optional[str] = None,
+ lora_load_kwargs: Optional[Dict] = None,
+ lora_fuse_kwargs: Optional[Dict] = None,
+ **kwargs,
  ):
  self._model_uid = model_uid
  self._model_path = model_path
  self._device = device
  self._model = None
+ self._lora_model_path = lora_model_path
+ self._lora_load_kwargs = lora_load_kwargs or {}
+ self._lora_fuse_kwargs = lora_fuse_kwargs or {}
  self._kwargs = kwargs

+ def _apply_lora(self):
+ if self._lora_model_path is not None:
+ logger.info(
+ f"Loading the LoRA with load kwargs: {self._lora_load_kwargs}, fuse kwargs: {self._lora_fuse_kwargs}."
+ )
+ assert self._model is not None
+ self._model.load_lora_weights(
+ self._lora_model_path, **self._lora_load_kwargs
+ )
+ self._model.fuse_lora(**self._lora_fuse_kwargs)
+ logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
  def load(self):
  # import torch
  from diffusers import AutoPipelineForText2Image
@@ -61,6 +83,7 @@ class DiffusionModel:
  self._model = move_model_to_available_device(self._model)
  # Recommended if your computer has < 64 GB of RAM
  self._model.enable_attention_slicing()
+ self._apply_lora()

  def _call_model(
  self,
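
DiffusionModel now optionally loads and fuses a LoRA right after the diffusers pipeline is created. A hedged sketch of the underlying diffusers calls that _apply_lora wraps; the model id, adapter path, and kwargs are placeholders, and running this for real requires the weights and suitable hardware:

from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo")  # placeholder model id
pipe.load_lora_weights("/path/to/lora", adapter_name="style")  # load kwargs (illustrative)
pipe.fuse_lora(lora_scale=0.8)                                 # fuse kwargs (illustrative)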
xinference/model/llm/__init__.py CHANGED
@@ -31,6 +31,7 @@ from .llm_family import (
  BUILTIN_LLM_PROMPT_STYLE,
  BUILTIN_MODELSCOPE_LLM_FAMILIES,
  LLM_CLASSES,
+ PEFT_SUPPORTED_CLASSES,
  CustomLLMFamilyV1,
  GgmlLLMSpecV1,
  LLMFamilyV1,
@@ -95,6 +96,22 @@ def _install():
  PytorchModel,
  ]
  )
+ PEFT_SUPPORTED_CLASSES.extend(
+ [
+ BaichuanPytorchChatModel,
+ VicunaPytorchChatModel,
+ FalconPytorchChatModel,
+ ChatglmPytorchChatModel,
+ LlamaPytorchModel,
+ LlamaPytorchChatModel,
+ PytorchChatModel,
+ FalconPytorchModel,
+ Internlm2PytorchChatModel,
+ QwenVLChatModel,
+ YiVLChatModel,
+ PytorchModel,
+ ]
+ )

  json_path = os.path.join(
  os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
xinference/model/llm/core.py CHANGED
@@ -180,6 +180,7 @@ def create_llm_model_instance(
  model_format: Optional[str] = None,
  model_size_in_billions: Optional[int] = None,
  quantization: Optional[str] = None,
+ peft_model_path: Optional[str] = None,
  is_local_deployment: bool = False,
  **kwargs,
  ) -> Tuple[LLM, LLMDescription]:
@@ -203,7 +204,9 @@ def create_llm_model_instance(
  assert quantization is not None
  save_path = cache(llm_family, llm_spec, quantization)

- llm_cls = match_llm_cls(llm_family, llm_spec, quantization)
+ llm_cls = match_llm_cls(
+ llm_family, llm_spec, quantization, peft_model_path=peft_model_path
+ )
  if not llm_cls:
  raise ValueError(
  f"Model not supported, name: {model_name}, format: {model_format},"
@@ -211,7 +214,20 @@ def create_llm_model_instance(
  )
  logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")

- model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs)
+ if peft_model_path is not None:
+ model = llm_cls(
+ model_uid,
+ llm_family,
+ llm_spec,
+ quantization,
+ save_path,
+ kwargs,
+ peft_model_path,
+ )
+ else:
+ model = llm_cls(
+ model_uid, llm_family, llm_spec, quantization, save_path, kwargs
+ )
  return model, LLMDescription(
  subpool_addr, devices, llm_family, llm_spec, quantization
  )
xinference/model/llm/ggml/llamacpp.py CHANGED
@@ -35,15 +35,6 @@ from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
  logger = logging.getLogger(__name__)


- SIZE_TO_GPU_LAYERS = {
- 3: 26,
- 7: 32,
- 13: 40,
- 30: 60,
- 65: 80,
- }
-
-
  class LlamaCppModel(LLM):
  def __init__(
  self,
@@ -56,13 +47,6 @@ class LlamaCppModel(LLM):
  ):
  super().__init__(model_uid, model_family, model_spec, quantization, model_path)

- closest_size = min(
- SIZE_TO_GPU_LAYERS.keys(),
- key=lambda x: abs(
- x - self.handle_model_size(model_spec.model_size_in_billions)
- ),
- )
- self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
  self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
  llamacpp_model_config
  )
@@ -96,9 +80,9 @@ class LlamaCppModel(LLM):

  if self._is_darwin_and_apple_silicon() and self._can_apply_metal():
  # TODO: platform.processor() is not safe, need to be replaced to other method.
- llamacpp_model_config.setdefault("n_gpu_layers", 1)
+ llamacpp_model_config.setdefault("n_gpu_layers", -1)
  elif self._is_linux() and self._can_apply_cublas():
- llamacpp_model_config.setdefault("n_gpu_layers", self._gpu_layers)
+ llamacpp_model_config.setdefault("n_gpu_layers", -1)

  return llamacpp_model_config

@@ -313,7 +297,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
  generate_config["stop"] = [stop, "Observation:"]
  elif isinstance(stop, Iterable):
  assert not isinstance(stop, str)
- generate_config["stop"] = stop + ["Observation:"]
+ generate_config["stop"] = stop + ["Observation:"] # type: ignore
  else:
  generate_config["stop"] = "Observation:"

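The fixed SIZE_TO_GPU_LAYERS table is gone: llama-cpp-python treats n_gpu_layers=-1 as "offload every layer", so both the Metal and cuBLAS paths now default to full offload. A minimal sketch of the equivalent direct call; the model path is a placeholder:

from llama_cpp import Llama

llm = Llama(model_path="/path/to/model.gguf", n_gpu_layers=-1)  # offload all layers to GPU
out = llm("Q: What is 2 + 2? A:", max_tokens=8)
print(out["choices"][0]["text"])
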
xinference/model/llm/llm_family.json CHANGED
@@ -1599,10 +1599,15 @@
  "model_size_in_billions": 72,
  "quantizations": [
  "q2_k",
- "q3_k_m"
+ "q3_k_m",
+ "q4_k_m"
  ],
  "model_id": "Qwen/Qwen1.5-72B-Chat-GGUF",
- "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+ "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+ "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+ "quantization_parts": {
+ "q4_k_m": ["a", "b"]
+ }
  }
  ],
  "prompt_style": {
@@ -2967,7 +2972,7 @@
  },
  {
  "version": 1,
- "context_length": 100000,
+ "context_length": 16384,
  "model_name": "glaive-coder",
  "model_description": "A code model trained on a dataset of ~140k programming related problems and solutions generated from Glaive’s synthetic data generation platform.",
  "model_lang": [