xinference 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.
Files changed (47)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +33 -0
  3. xinference/client/common.py +2 -0
  4. xinference/client/restful/restful_client.py +49 -17
  5. xinference/conftest.py +4 -1
  6. xinference/core/supervisor.py +11 -1
  7. xinference/core/worker.py +29 -9
  8. xinference/deploy/cmdline.py +73 -2
  9. xinference/deploy/utils.py +25 -1
  10. xinference/device_utils.py +0 -2
  11. xinference/model/core.py +13 -2
  12. xinference/model/image/core.py +16 -2
  13. xinference/model/image/stable_diffusion/core.py +25 -2
  14. xinference/model/llm/__init__.py +17 -0
  15. xinference/model/llm/core.py +18 -2
  16. xinference/model/llm/ggml/llamacpp.py +3 -19
  17. xinference/model/llm/llm_family.json +8 -3
  18. xinference/model/llm/llm_family.py +100 -29
  19. xinference/model/llm/llm_family_modelscope.json +57 -3
  20. xinference/model/llm/pytorch/baichuan.py +2 -0
  21. xinference/model/llm/pytorch/chatglm.py +2 -0
  22. xinference/model/llm/pytorch/core.py +23 -0
  23. xinference/model/llm/pytorch/falcon.py +4 -0
  24. xinference/model/llm/pytorch/internlm2.py +2 -0
  25. xinference/model/llm/pytorch/llama_2.py +4 -0
  26. xinference/model/llm/pytorch/qwen_vl.py +1 -0
  27. xinference/model/llm/pytorch/vicuna.py +2 -0
  28. xinference/model/llm/pytorch/yi_vl.py +1 -0
  29. xinference/types.py +5 -2
  30. xinference/web/ui/build/asset-manifest.json +3 -3
  31. xinference/web/ui/build/index.html +1 -1
  32. xinference/web/ui/build/static/js/{main.87d39ffb.js → main.78829790.js} +3 -3
  33. xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
  37. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/METADATA +7 -5
  38. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/RECORD +43 -43
  39. xinference/web/ui/build/static/js/main.87d39ffb.js.map +0 -1
  40. xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
  41. xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +0 -1
  42. xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
  43. /xinference/web/ui/build/static/js/{main.87d39ffb.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
  44. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
  45. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
  46. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
  47. {xinference-0.9.0.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-02-22T15:40:53+0800",
+ "date": "2024-03-08T13:28:03+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "c653c975847f9f6a81382033a9c8f5bd81bf70f2",
- "version": "0.9.0"
+ "full-revisionid": "29f4c10a854cfec684dcf8398a0974f64bf8ce2b",
+ "version": "0.9.2"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -219,6 +219,11 @@ class RESTfulAPI:
         self._router.add_api_route(
             "/v1/models/families", self._get_builtin_families, methods=["GET"]
         )
+        self._router.add_api_route(
+            "/v1/models/vllm-supported",
+            self.list_vllm_supported_model_families,
+            methods=["GET"],
+        )
         self._router.add_api_route(
             "/v1/cluster/info", self.get_cluster_device_info, methods=["GET"]
         )
@@ -651,6 +656,9 @@ class RESTfulAPI:
         replica = payload.get("replica", 1)
         n_gpu = payload.get("n_gpu", "auto")
         request_limits = payload.get("request_limits", None)
+        peft_model_path = payload.get("peft_model_path", None)
+        image_lora_load_kwargs = payload.get("image_lora_load_kwargs", None)
+        image_lora_fuse_kwargs = payload.get("image_lora_fuse_kwargs", None)
 
         exclude_keys = {
             "model_uid",
@@ -662,6 +670,9 @@
             "replica",
             "n_gpu",
             "request_limits",
+            "peft_model_path",
+            "image_lora_load_kwargs",
+            "image_lora_fuse_kwargs",
         }
 
         kwargs = {
@@ -686,6 +697,9 @@
             n_gpu=n_gpu,
             request_limits=request_limits,
             wait_ready=wait_ready,
+            peft_model_path=peft_model_path,
+            image_lora_load_kwargs=image_lora_load_kwargs,
+            image_lora_fuse_kwargs=image_lora_fuse_kwargs,
             **kwargs,
         )
 
@@ -845,6 +859,7 @@
         }
         kwargs = body.dict(exclude_unset=True, exclude=exclude)
 
+        # TODO: Decide if this default value override is necessary #1061
         if body.max_tokens is None:
             kwargs["max_tokens"] = max_tokens_field.default
 
@@ -1136,6 +1151,7 @@
         }
         kwargs = body.dict(exclude_unset=True, exclude=exclude)
 
+        # TODO: Decide if this default value override is necessary #1061
         if body.max_tokens is None:
             kwargs["max_tokens"] = max_tokens_field.default
 
@@ -1256,6 +1272,7 @@
                     self.handle_request_limit_error(re)
                 async for item in iterator:
                     yield item
+                yield "[DONE]"
             except Exception as ex:
                 logger.exception("Chat completion stream got an error: %s", ex)
                 await self._report_error_event(model_uid, str(ex))
@@ -1348,6 +1365,22 @@
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def list_vllm_supported_model_families(self) -> JSONResponse:
+        try:
+            from ..model.llm.vllm.core import (
+                VLLM_SUPPORTED_CHAT_MODELS,
+                VLLM_SUPPORTED_MODELS,
+            )
+
+            data = {
+                "chat": VLLM_SUPPORTED_CHAT_MODELS,
+                "generate": VLLM_SUPPORTED_MODELS,
+            }
+            return JSONResponse(content=data)
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def get_cluster_device_info(
         self, detailed: bool = Query(False)
     ) -> JSONResponse:
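
The new /v1/models/vllm-supported route simply returns the chat and generate model families that the vLLM backend can serve. As a usage sketch, not part of the diff (the endpoint address is a placeholder for your own deployment), it can be queried with plain HTTP once a server is running:

    import requests

    # Placeholder endpoint; replace with your deployment's address.
    resp = requests.get("http://127.0.0.1:9997/v1/models/vllm-supported")
    resp.raise_for_status()
    families = resp.json()
    print(families["chat"])      # vLLM-compatible chat model families
    print(families["generate"])  # vLLM-compatible generate model families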
xinference/client/common.py CHANGED
@@ -43,6 +43,8 @@ def streaming_response_iterator(
         line = line.strip()
         if line.startswith(b"data:"):
             json_str = line[len(b"data:") :].strip()
+            if json_str == b"[DONE]":
+                continue
             data = json.loads(json_str.decode("utf-8"))
             error = data.get("error")
             if error is not None:
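
Together with the server-side `yield "[DONE]"` above, streaming responses now end with an OpenAI-style sentinel that the client iterator silently skips. A sketch of how an independent SSE consumer could treat the same stream (helper name is hypothetical, not from the package):

    import json

    def iter_stream_events(raw_lines):
        # Parse 'data:' lines from an SSE byte stream, stopping at the [DONE] sentinel.
        for line in raw_lines:
            line = line.strip()
            if not line.startswith(b"data:"):
                continue
            payload = line[len(b"data:"):].strip()
            if payload == b"[DONE]":
                break
            yield json.loads(payload.decode("utf-8"))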
xinference/client/restful/restful_client.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import typing
 import warnings
 from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
 
@@ -47,6 +48,25 @@ def _get_error_string(response: requests.Response) -> str:
     return "Unknown error"
 
 
+@typing.no_type_check
+def handle_system_prompts(
+    chat_history: List["ChatCompletionMessage"], system_prompt: Optional[str]
+) -> List["ChatCompletionMessage"]:
+    history_system_prompts = [
+        ch["content"] for ch in chat_history if ch["role"] == "system"
+    ]
+    if system_prompt is not None:
+        history_system_prompts.append(system_prompt)
+
+    # remove all the system prompt in the chat_history
+    chat_history = list(filter(lambda x: x["role"] != "system", chat_history))
+    # insert all system prompts at the beginning
+    chat_history.insert(
+        0, {"role": "system", "content": ". ".join(history_system_prompts)}
+    )
+    return chat_history
+
+
 class RESTfulModelHandle:
     """
     A sync model interface (for RESTful client) which provides type hints that makes it much easier to use xinference
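
The new handle_system_prompts helper collapses every system message already present in chat_history, plus an optional explicit system_prompt, into a single leading system message joined with ". ". An illustrative call (messages invented for the example):

    history = [
        {"role": "system", "content": "You are concise"},
        {"role": "user", "content": "Hi"},
        {"role": "assistant", "content": "Hello!"},
    ]
    merged = handle_system_prompts(history, system_prompt="Answer in French")
    # merged[0] == {"role": "system", "content": "You are concise. Answer in French"}
    # the user/assistant messages keep their original order after it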
@@ -363,15 +383,8 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
         if chat_history is None:
             chat_history = []
 
-        if chat_history and chat_history[0]["role"] == "system":
-            if system_prompt is not None:
-                chat_history[0]["content"] = system_prompt
-
-        else:
-            if system_prompt is not None:
-                chat_history.insert(0, {"role": "system", "content": system_prompt})
-
-        chat_history.append({"role": "user", "content": prompt})
+        chat_history = handle_system_prompts(chat_history, system_prompt)
+        chat_history.append({"role": "user", "content": prompt})  # type: ignore
 
         request_body: Dict[str, Any] = {
             "model": self._model_uid,
@@ -444,14 +457,8 @@ class RESTfulChatglmCppChatModelHandle(RESTfulModelHandle):
         if chat_history is None:
             chat_history = []
 
-        if chat_history and chat_history[0]["role"] == "system":
-            if system_prompt is not None:
-                chat_history[0]["content"] = system_prompt
-        else:
-            if system_prompt is not None:
-                chat_history.insert(0, {"role": "system", "content": system_prompt})
-
-        chat_history.append({"role": "user", "content": prompt})
+        chat_history = handle_system_prompts(chat_history, system_prompt)
+        chat_history.append({"role": "user", "content": prompt})  # type: ignore
 
         request_body: Dict[str, Any] = {
             "model": self._model_uid,
@@ -676,6 +683,19 @@ class Client:
         response_data = response.json()
         self._cluster_authed = bool(response_data["auth"])
 
+    def vllm_models(self) -> Dict[str, Any]:
+        url = f"{self.base_url}/v1/models/vllm-supported"
+        response = requests.get(url, headers=self._headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to fetch VLLM models. detail: {response.json()['detail']}"
+            )
+
+        try:
+            return response.json()
+        except Exception as e:
+            raise RuntimeError(f"Error parsing JSON response: {e}")
+
     def login(self, username: str, password: str):
         if not self._cluster_authed:
             return
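
On the client side the same information is exposed through the new vllm_models() method. A small usage sketch (the endpoint is a placeholder; RESTfulClient is the exported name of this Client class):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    supported = client.vllm_models()
    print(supported["chat"])      # chat families the vLLM backend can load
    print(supported["generate"])  # generate families the vLLM backend can load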
@@ -771,6 +791,9 @@
         replica: int = 1,
         n_gpu: Optional[Union[int, str]] = "auto",
         request_limits: Optional[int] = None,
+        peft_model_path: Optional[str] = None,
+        image_lora_load_kwargs: Optional[Dict] = None,
+        image_lora_fuse_kwargs: Optional[Dict] = None,
         **kwargs,
     ) -> str:
         """
@@ -798,6 +821,12 @@
         request_limits: Optional[int]
             The number of request limits for this model, default is None.
             ``request_limits=None`` means no limits for this model.
+        peft_model_path: Optional[str]
+            PEFT (Parameter-Efficient Fine-Tuning) model path.
+        image_lora_load_kwargs: Optional[Dict]
+            lora load parameters for image model
+        image_lora_fuse_kwargs: Optional[Dict]
+            lora fuse parameters for image model
         **kwargs:
             Any other parameters been specified.
 
@@ -820,6 +849,9 @@
             "replica": replica,
             "n_gpu": n_gpu,
             "request_limits": request_limits,
+            "peft_model_path": peft_model_path,
+            "image_lora_load_kwargs": image_lora_load_kwargs,
+            "image_lora_fuse_kwargs": image_lora_fuse_kwargs,
         }
 
         for key, value in kwargs.items():
xinference/conftest.py CHANGED
@@ -25,6 +25,10 @@ from typing import Dict, Optional
 import pytest
 import xoscar as xo
 
+# skip health checking for CI
+if os.environ.get("GITHUB_ACTIONS"):
+    os.environ["XINFERENCE_DISABLE_HEALTH_CHECK"] = "1"
+
 from .api.oauth2.types import AuthConfig, AuthStartupConfig, User
 from .constants import XINFERENCE_LOG_BACKUP_COUNT, XINFERENCE_LOG_MAX_BYTES
 from .core.supervisor import SupervisorActor
@@ -134,7 +138,6 @@ async def _start_test_cluster(
     logging_conf: Optional[Dict] = None,
 ):
     logging.config.dictConfig(logging_conf)  # type: ignore
-
     pool = None
     try:
         pool = await create_worker_actor_pool(
xinference/core/supervisor.py CHANGED
@@ -714,6 +714,9 @@ class SupervisorActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
         wait_ready: bool = True,
         model_version: Optional[str] = None,
+        peft_model_path: Optional[str] = None,
+        image_lora_load_kwargs: Optional[Dict] = None,
+        image_lora_fuse_kwargs: Optional[Dict] = None,
         **kwargs,
     ) -> str:
         if model_uid is None:
@@ -751,6 +754,9 @@
             model_type=model_type,
             n_gpu=n_gpu,
             request_limits=request_limits,
+            peft_model_path=peft_model_path,
+            image_lora_load_kwargs=image_lora_load_kwargs,
+            image_lora_fuse_kwargs=image_lora_fuse_kwargs,
             **kwargs,
         )
         self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
@@ -922,7 +928,11 @@
         workers = list(self._worker_address_to_worker.values())
         for worker in workers:
             ret.update(await worker.list_models())
-        return {parse_replica_model_uid(k)[0]: v for k, v in ret.items()}
+        running_model_info = {parse_replica_model_uid(k)[0]: v for k, v in ret.items()}
+        # add replica count
+        for k, v in running_model_info.items():
+            v["replica"] = self._model_uid_to_replica_info[k].replica
+        return running_model_info
 
     def is_local_deployment(self) -> bool:
         # TODO: temporary.
xinference/core/worker.py CHANGED
@@ -27,7 +27,11 @@ import xoscar as xo
 from async_timeout import timeout
 from xoscar import MainActorPoolType
 
-from ..constants import XINFERENCE_CACHE_DIR
+from ..constants import (
+    XINFERENCE_CACHE_DIR,
+    XINFERENCE_DISABLE_HEALTH_CHECK,
+    XINFERENCE_HEALTH_CHECK_INTERVAL,
+)
 from ..core import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..device_utils import gpu_count
@@ -40,7 +44,6 @@ from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
 logger = getLogger(__name__)
 
 
-DEFAULT_NODE_HEARTBEAT_INTERVAL = 5
 MODEL_ACTOR_AUTO_RECOVER_LIMIT: Optional[int]
 _MODEL_ACTOR_AUTO_RECOVER_LIMIT = os.getenv("XINFERENCE_MODEL_ACTOR_AUTO_RECOVER_LIMIT")
 if _MODEL_ACTOR_AUTO_RECOVER_LIMIT is not None:
@@ -177,12 +180,13 @@ class WorkerActor(xo.StatelessActor):
             address=self._supervisor_address, uid=SupervisorActor.uid()
         )
         await self._supervisor_ref.add_worker(self.address)
-        # Run _periodical_report_status() in a dedicated thread.
-        self._isolation = Isolation(asyncio.new_event_loop(), threaded=True)
-        self._isolation.start()
-        asyncio.run_coroutine_threadsafe(
-            self._periodical_report_status(), loop=self._isolation.loop
-        )
+        if not XINFERENCE_DISABLE_HEALTH_CHECK:
+            # Run _periodical_report_status() in a dedicated thread.
+            self._isolation = Isolation(asyncio.new_event_loop(), threaded=True)
+            self._isolation.start()
+            asyncio.run_coroutine_threadsafe(
+                self._periodical_report_status(), loop=self._isolation.loop
+            )
         logger.info(f"Xinference worker {self.address} started")
         logger.info("Purge cache directory: %s", XINFERENCE_CACHE_DIR)
         purge_dir(XINFERENCE_CACHE_DIR)
@@ -487,6 +491,9 @@
         quantization: Optional[str],
         model_type: str = "LLM",
         n_gpu: Optional[Union[int, str]] = "auto",
+        peft_model_path: Optional[str] = None,
+        image_lora_load_kwargs: Optional[Dict] = None,
+        image_lora_fuse_kwargs: Optional[Dict] = None,
         request_limits: Optional[int] = None,
         **kwargs,
     ):
@@ -512,6 +519,16 @@
         if isinstance(n_gpu, str) and n_gpu != "auto":
             raise ValueError("Currently `n_gpu` only supports `auto`.")
 
+        if peft_model_path is not None:
+            if model_type in ("embedding", "rerank"):
+                raise ValueError(
+                    f"PEFT adaptors cannot be applied to embedding or rerank models."
+                )
+            if model_type == "LLM" and model_format in ("ggufv2", "ggmlv3"):
+                raise ValueError(
+                    f"PEFT adaptors can only be applied to pytorch-like models"
+                )
+
         assert model_uid not in self._model_uid_to_model
         self._check_model_is_valid(model_name, model_format)
         assert self._supervisor_ref is not None
@@ -533,6 +550,9 @@
             model_format,
             model_size_in_billions,
             quantization,
+            peft_model_path,
+            image_lora_load_kwargs,
+            image_lora_fuse_kwargs,
             is_local_deployment,
             **kwargs,
         )
@@ -662,7 +682,7 @@
             ) as ex:  # pragma: no cover # noqa: E722 # nosec # pylint: disable=bare-except
                 logger.error(f"Failed to upload node info: {ex}")
             try:
-                await asyncio.sleep(DEFAULT_NODE_HEARTBEAT_INTERVAL)
+                await asyncio.sleep(XINFERENCE_HEALTH_CHECK_INTERVAL)
             except asyncio.CancelledError:  # pragma: no cover
                 break
 
xinference/deploy/cmdline.py CHANGED
@@ -17,7 +17,7 @@ import logging
 import os
 import sys
 import warnings
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
 import click
 from xoscar.utils import get_next_port
@@ -40,7 +40,12 @@ from ..constants (
 )
 from ..isolation import Isolation
 from ..types import ChatCompletionMessage
-from .utils import get_config_dict, get_log_file, get_timestamp_ms
+from .utils import (
+    get_config_dict,
+    get_log_file,
+    get_timestamp_ms,
+    handle_click_args_type,
+)
 
 try:
     # provide elaborate line editing and history features.
@@ -525,6 +530,10 @@ def list_model_registrations(
 @cli.command(
     "launch",
     help="Launch a model with the Xinference framework with the given parameters.",
+    context_settings=dict(
+        ignore_unknown_options=True,
+        allow_extra_args=True,
+    ),
 )
 @click.option(
     "--endpoint",
@@ -587,13 +596,35 @@
     type=str,
     help='The number of GPUs used by the model, default is "auto".',
 )
+@click.option(
+    "--peft-model-path",
+    default=None,
+    type=str,
+    help="PEFT model path.",
+)
+@click.option(
+    "--image-lora-load-kwargs",
+    "-ld",
+    "image_lora_load_kwargs",
+    type=(str, str),
+    multiple=True,
+)
+@click.option(
+    "--image-lora-fuse-kwargs",
+    "-fd",
+    "image_lora_fuse_kwargs",
+    type=(str, str),
+    multiple=True,
+)
 @click.option(
     "--trust-remote-code",
     default=True,
     type=bool,
    help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.pass_context
 def model_launch(
+    ctx,
     endpoint: Optional[str],
     model_name: str,
     model_type: str,
@@ -603,8 +634,18 @@
     quantization: str,
     replica: int,
     n_gpu: str,
+    peft_model_path: Optional[str],
+    image_lora_load_kwargs: Optional[Tuple],
+    image_lora_fuse_kwargs: Optional[Tuple],
     trust_remote_code: bool,
 ):
+    kwargs = {}
+    for i in range(0, len(ctx.args), 2):
+        if not ctx.args[i].startswith("--"):
+            raise ValueError("You must specify extra kwargs with `--` prefix.")
+        kwargs[ctx.args[i][2:]] = handle_click_args_type(ctx.args[i + 1])
+    print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)
+
     if n_gpu.lower() == "none":
         _n_gpu: Optional[Union[int, str]] = None
     elif n_gpu == "auto":
@@ -612,6 +653,17 @@
     else:
         _n_gpu = int(n_gpu)
 
+    image_lora_load_params = (
+        {k: handle_click_args_type(v) for k, v in dict(image_lora_load_kwargs).items()}
+        if image_lora_load_kwargs
+        else None
+    )
+    image_lora_fuse_params = (
+        {k: handle_click_args_type(v) for k, v in dict(image_lora_fuse_kwargs).items()}
+        if image_lora_fuse_kwargs
+        else None
+    )
+
     endpoint = get_endpoint(endpoint)
     model_size: Optional[Union[str, int]] = (
         size_in_billions
@@ -630,7 +682,11 @@
         quantization=quantization,
         replica=replica,
         n_gpu=_n_gpu,
+        peft_model_path=peft_model_path,
+        image_lora_load_kwargs=image_lora_load_params,
+        image_lora_fuse_kwargs=image_lora_fuse_params,
         trust_remote_code=trust_remote_code,
+        **kwargs,
     )
 
     print(f"Model uid: {model_uid}", file=sys.stderr)
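
With these options a LoRA adapter can be attached at launch time from the command line, and any unrecognized `--key value` pairs are forwarded to the model as extra kwargs via handle_click_args_type. An illustrative invocation, not taken from the diff (model name, adapter path, and kwarg values are placeholders):

    xinference launch --model-name stable-diffusion-xl-base-1.0 \
        --model-type image \
        --peft-model-path /path/to/lora_adapter \
        -ld weight_name pytorch_lora_weights.safetensors \
        -fd lora_scale 0.6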
@@ -925,6 +981,21 @@ def model_chat(
     )
 
 
+@cli.command("vllm-models", help="Query and display models compatible with VLLM.")
+@click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+def vllm_models(endpoint: Optional[str]):
+    endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+    vllm_models_dict = client.vllm_models()
+    print("VLLM supported model families:")
+    chat_models = vllm_models_dict["chat"]
+    supported_models = vllm_models_dict["generate"]
+
+    print("VLLM supported chat model families:", chat_models)
+    print("VLLM supported generate model families:", supported_models)
+
+
 @cli.command("login", help="Login when the cluster is authenticated.")
 @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
 @click.option("--username", type=str, required=True, help="Username.")
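
The new subcommand prints the same families from a terminal; an illustrative session against a running cluster (endpoint is a placeholder and the output is abridged):

    $ xinference vllm-models --endpoint http://127.0.0.1:9997
    VLLM supported model families:
    VLLM supported chat model families: ['llama-2-chat', ...]
    VLLM supported generate model families: ['llama-2', ...]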
xinference/deploy/utils.py CHANGED
@@ -15,7 +15,8 @@
 import logging
 import os
 import time
-from typing import TYPE_CHECKING, Optional
+import typing
+from typing import TYPE_CHECKING, Any, Optional
 
 import xoscar as xo
 
@@ -159,3 +160,26 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bo
 def get_timestamp_ms():
     t = time.time()
     return int(round(t * 1000))
+
+
+@typing.no_type_check
+def handle_click_args_type(arg: str) -> Any:
+    if arg == "None":
+        return None
+    if arg in ("True", "true"):
+        return True
+    if arg in ("False", "false"):
+        return False
+    try:
+        result = int(arg)
+        return result
+    except:
+        pass
+
+    try:
+        result = float(arg)
+        return result
+    except:
+        pass
+
+    return arg
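
handle_click_args_type coerces the string values collected from the command line into Python literals before they are forwarded as kwargs. A few illustrative conversions:

    handle_click_args_type("None")   # -> None
    handle_click_args_type("true")   # -> True
    handle_click_args_type("8000")   # -> 8000 (int)
    handle_click_args_type("0.6")    # -> 0.6 (float)
    handle_click_args_type("cuda")   # -> "cuda" (left as a plain string)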
xinference/device_utils.py CHANGED
@@ -92,8 +92,6 @@ def gpu_count():
         )
 
         return min(torch.cuda.device_count(), len(cuda_visible_devices))
-    elif torch.backends.mps.is_available():
-        return 1
     elif is_xpu_available():
         return torch.xpu.device_count()
     else:
xinference/model/core.py CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 from .._compat import BaseModel
 
@@ -52,6 +52,9 @@ def create_model_instance(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[int] = None,
     quantization: Optional[str] = None,
+    peft_model_path: Optional[str] = None,
+    image_lora_load_kwargs: Optional[Dict] = None,
+    image_lora_fuse_kwargs: Optional[Dict] = None,
     is_local_deployment: bool = False,
     **kwargs,
 ) -> Tuple[Any, ModelDescription]:
@@ -70,6 +73,7 @@
         model_format,
         model_size_in_billions,
         quantization,
+        peft_model_path,
         is_local_deployment,
         **kwargs,
     )
@@ -82,7 +86,14 @@
     elif model_type == "image":
         kwargs.pop("trust_remote_code", None)
         return create_image_model_instance(
-            subpool_addr, devices, model_uid, model_name, **kwargs
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            lora_model_path=peft_model_path,
+            lora_load_kwargs=image_lora_load_kwargs,
+            lora_fuse_kwargs=image_lora_fuse_kwargs,
+            **kwargs,
         )
     elif model_type == "rerank":
         kwargs.pop("trust_remote_code", None)
xinference/model/image/core.py CHANGED
@@ -155,7 +155,14 @@ def get_cache_status(
 
 
 def create_image_model_instance(
-    subpool_addr: str, devices: List[str], model_uid: str, model_name: str, **kwargs
+    subpool_addr: str,
+    devices: List[str],
+    model_uid: str,
+    model_name: str,
+    lora_model_path: Optional[str] = None,
+    lora_load_kwargs: Optional[Dict] = None,
+    lora_fuse_kwargs: Optional[Dict] = None,
+    **kwargs,
 ) -> Tuple[DiffusionModel, ImageModelDescription]:
     model_spec = match_diffusion(model_name)
     controlnet = kwargs.get("controlnet")
@@ -187,7 +194,14 @@
     else:
         kwargs["controlnet"] = controlnet_model_paths
     model_path = cache(model_spec)
-    model = DiffusionModel(model_uid, model_path, **kwargs)
+    model = DiffusionModel(
+        model_uid,
+        model_path,
+        lora_model_path=lora_model_path,
+        lora_load_kwargs=lora_load_kwargs,
+        lora_fuse_kwargs=lora_fuse_kwargs,
+        **kwargs,
+    )
     model_description = ImageModelDescription(
         subpool_addr, devices, model_spec, model_path=model_path
     )
xinference/model/image/stable_diffusion/core.py CHANGED
@@ -21,7 +21,7 @@ import uuid
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from io import BytesIO
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 
 from ....constants import XINFERENCE_IMAGE_DIR
 from ....device_utils import move_model_to_available_device
@@ -32,14 +32,36 @@ logger = logging.getLogger(__name__)
 
 class DiffusionModel:
     def __init__(
-        self, model_uid: str, model_path: str, device: Optional[str] = None, **kwargs
+        self,
+        model_uid: str,
+        model_path: str,
+        device: Optional[str] = None,
+        lora_model_path: Optional[str] = None,
+        lora_load_kwargs: Optional[Dict] = None,
+        lora_fuse_kwargs: Optional[Dict] = None,
+        **kwargs,
     ):
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
         self._model = None
+        self._lora_model_path = lora_model_path
+        self._lora_load_kwargs = lora_load_kwargs or {}
+        self._lora_fuse_kwargs = lora_fuse_kwargs or {}
         self._kwargs = kwargs
 
+    def _apply_lora(self):
+        if self._lora_model_path is not None:
+            logger.info(
+                f"Loading the LoRA with load kwargs: {self._lora_load_kwargs}, fuse kwargs: {self._lora_fuse_kwargs}."
+            )
+            assert self._model is not None
+            self._model.load_lora_weights(
+                self._lora_model_path, **self._lora_load_kwargs
+            )
+            self._model.fuse_lora(**self._lora_fuse_kwargs)
+            logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
     def load(self):
         # import torch
         from diffusers import AutoPipelineForText2Image
@@ -61,6 +83,7 @@
             self._model = move_model_to_available_device(self._model)
         # Recommended if your computer has < 64 GB of RAM
         self._model.enable_attention_slicing()
+        self._apply_lora()
 
     def _call_model(
         self,
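
End to end, peft_model_path and the image_lora_*_kwargs flow from the client through the supervisor and worker into DiffusionModel._apply_lora, which calls the diffusers pipeline's load_lora_weights and fuse_lora after the base pipeline is loaded. A sketch of launching an image model with a LoRA adapter through the Python client (endpoint, model name, adapter path, and kwarg values are illustrative, not prescribed by the diff):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # placeholder endpoint
    model_uid = client.launch_model(
        model_name="stable-diffusion-xl-base-1.0",
        model_type="image",
        peft_model_path="/path/to/lora_adapter",
        image_lora_load_kwargs={"weight_name": "pytorch_lora_weights.safetensors"},
        image_lora_fuse_kwargs={"lora_scale": 0.6},
    )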