xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (108)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +2 -1
  3. xinference/core/model.py +8 -4
  4. xinference/core/supervisor.py +2 -3
  5. xinference/core/worker.py +7 -5
  6. xinference/deploy/cmdline.py +2 -0
  7. xinference/deploy/local.py +5 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/deploy/worker.py +6 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/model_spec.json +44 -20
  12. xinference/model/core.py +3 -0
  13. xinference/model/embedding/flag/core.py +5 -0
  14. xinference/model/embedding/llama_cpp/core.py +22 -19
  15. xinference/model/embedding/sentence_transformers/core.py +18 -4
  16. xinference/model/embedding/vllm/core.py +36 -9
  17. xinference/model/image/cache_manager.py +56 -0
  18. xinference/model/image/core.py +9 -0
  19. xinference/model/image/model_spec.json +178 -1
  20. xinference/model/image/stable_diffusion/core.py +155 -23
  21. xinference/model/llm/cache_manager.py +17 -3
  22. xinference/model/llm/harmony.py +245 -0
  23. xinference/model/llm/llama_cpp/core.py +41 -40
  24. xinference/model/llm/llm_family.json +688 -11
  25. xinference/model/llm/llm_family.py +1 -1
  26. xinference/model/llm/sglang/core.py +108 -5
  27. xinference/model/llm/transformers/core.py +20 -18
  28. xinference/model/llm/transformers/gemma3.py +1 -1
  29. xinference/model/llm/transformers/gpt_oss.py +91 -0
  30. xinference/model/llm/transformers/multimodal/core.py +1 -1
  31. xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
  32. xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
  33. xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
  34. xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
  35. xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
  36. xinference/model/llm/transformers/utils.py +1 -33
  37. xinference/model/llm/utils.py +61 -7
  38. xinference/model/llm/vllm/core.py +44 -8
  39. xinference/model/rerank/__init__.py +66 -23
  40. xinference/model/rerank/cache_manager.py +35 -0
  41. xinference/model/rerank/core.py +87 -339
  42. xinference/model/rerank/custom.py +33 -8
  43. xinference/model/rerank/model_spec.json +251 -212
  44. xinference/model/rerank/rerank_family.py +137 -0
  45. xinference/model/rerank/sentence_transformers/__init__.py +13 -0
  46. xinference/model/rerank/sentence_transformers/core.py +337 -0
  47. xinference/model/rerank/vllm/__init__.py +13 -0
  48. xinference/model/rerank/vllm/core.py +156 -0
  49. xinference/model/utils.py +108 -0
  50. xinference/model/video/model_spec.json +95 -1
  51. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  52. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  53. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  54. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  55. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  56. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  57. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  58. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  59. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  60. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  61. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  62. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  63. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  64. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  65. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  66. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  67. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  69. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  70. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  71. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  72. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  73. xinference/types.py +2 -0
  74. xinference/ui/gradio/chat_interface.py +2 -0
  75. xinference/ui/gradio/media_interface.py +353 -7
  76. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  77. xinference/ui/web/ui/build/index.html +1 -1
  78. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  79. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  80. xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
  81. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  82. xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
  83. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  88. xinference/ui/web/ui/src/locales/en.json +2 -0
  89. xinference/ui/web/ui/src/locales/ja.json +2 -0
  90. xinference/ui/web/ui/src/locales/ko.json +2 -0
  91. xinference/ui/web/ui/src/locales/zh.json +2 -0
  92. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
  93. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
  94. xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
  95. xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
  96. xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
  97. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  98. xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
  99. xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
  100. xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
  101. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  102. xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
  103. xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
  104. /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  105. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
  106. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
  107. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
  108. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
--- a/xinference/model/image/cache_manager.py
+++ b/xinference/model/image/cache_manager.py
@@ -60,3 +60,59 @@ class ImageCacheManager(CacheManager):
             raise NotImplementedError
 
         return full_path
+
+    def cache_lightning(self, lightning_version: Optional[str] = None):
+        from ..utils import IS_NEW_HUGGINGFACE_HUB, retry_download, symlink_local_file
+        from .core import ImageModelFamilyV2
+
+        if not lightning_version:
+            return None
+
+        assert isinstance(self._model_family, ImageModelFamilyV2)
+        cache_dir = self.get_cache_dir()
+
+        if not self._model_family.lightning_model_file_name_template:
+            raise NotImplementedError(
+                f"{self._model_family.model_name} does not support lightning"
+            )
+        if lightning_version not in (self._model_family.lightning_versions or []):
+            raise ValueError(
+                f"Cannot support lightning version {lightning_version}, "
+                f"available lightning version: {self._model_family.lightning_versions}"
+            )
+
+        filename = self._model_family.lightning_model_file_name_template.format(lightning_version=lightning_version)  # type: ignore
+        full_path = os.path.join(cache_dir, filename)
+
+        if self._model_family.model_hub == "huggingface":
+            import huggingface_hub
+
+            use_symlinks = {}
+            if not IS_NEW_HUGGINGFACE_HUB:
+                use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+            download_file_path = retry_download(
+                huggingface_hub.hf_hub_download,
+                self._model_family.model_name,
+                None,
+                self._model_family.lightning_model_id,
+                filename=filename,
+                **use_symlinks,
+            )
+            if IS_NEW_HUGGINGFACE_HUB:
+                symlink_local_file(download_file_path, cache_dir, filename)
+        elif self._model_family.model_hub == "modelscope":
+            from modelscope.hub.file_download import model_file_download
+
+            download_file_path = retry_download(
+                model_file_download,
+                self._model_family.model_name,
+                None,
+                self._model_family.lightning_model_id,
+                filename,
+                revision=self._model_family.model_revision,
+            )
+            symlink_local_file(download_file_path, cache_dir, filename)
+        else:
+            raise NotImplementedError
+
+        return full_path
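
For orientation, a minimal usage sketch (not part of the diff) of the new helper, assuming a cache manager built from a registered image family whose spec carries the new lightning fields; the constructor argument and the example version string are illustrative assumptions:

    # Hypothetical sketch; ImageCacheManager comes from the module above, the
    # version string must be one of the family's lightning_versions.
    from xinference.model.image.cache_manager import ImageCacheManager

    def fetch_lightning_lora(model_family, version="8steps-V1.1"):
        cache_manager = ImageCacheManager(model_family)
        # Returns None when no version is requested, otherwise the local path
        # of the downloaded Lightning safetensors file inside the cache dir.
        return cache_manager.cache_lightning(version)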
--- a/xinference/model/image/core.py
+++ b/xinference/model/image/core.py
@@ -51,6 +51,10 @@ class ImageModelFamilyV2(CacheableModelSpec, ModelInstanceInfoMixin):
     gguf_model_id: Optional[str]
     gguf_quantizations: Optional[List[str]]
     gguf_model_file_name_template: Optional[str]
+    lightning_model_id: Optional[str]
+    lightning_versions: Optional[List[str]]
+    lightning_model_file_name_template: Optional[str]
+
     virtualenv: Optional[VirtualEnvSettings]
 
     class Config:
@@ -180,6 +184,8 @@ def create_image_model_instance(
     model_path: Optional[str] = None,
     gguf_quantization: Optional[str] = None,
     gguf_model_path: Optional[str] = None,
+    lightning_version: Optional[str] = None,
+    lightning_model_path: Optional[str] = None,
     **kwargs,
 ) -> Union[DiffusionModel, MLXDiffusionModel, GotOCR2Model]:
     from .cache_manager import ImageCacheManager
@@ -235,6 +241,8 @@ def create_image_model_instance(
         model_path = cache_manager.cache()
     if not gguf_model_path and gguf_quantization:
         gguf_model_path = cache_manager.cache_gguf(gguf_quantization)
+    if not lightning_model_path and lightning_version:
+        lightning_model_path = cache_manager.cache_lightning(lightning_version)
     if peft_model_config is not None:
         lora_model = peft_model_config.peft_model
         lora_load_kwargs = peft_model_config.image_lora_load_kwargs
@@ -262,6 +270,7 @@
         lora_fuse_kwargs=lora_fuse_kwargs,
         model_spec=model_spec,
         gguf_model_path=gguf_model_path,
+        lightning_model_path=lightning_model_path,
         **kwargs,
     )
     return model
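
Taken together with the Qwen-Image spec entries added below, these parameters are what a client launch request ultimately feeds into. As a hypothetical sketch (not from the diff): passing lightning_version as a model kwarg is an assumption here, modeled on how gguf_quantization already flows into create_image_model_instance, and the endpoint and version string are placeholders:

    # Hypothetical client-side sketch; kwarg plumbing, endpoint and version
    # string are assumptions for illustration only.
    from xinference.client import Client

    client = Client("http://localhost:9997")
    model_uid = client.launch_model(
        model_name="Qwen-Image",
        model_type="image",
        lightning_version="8steps-V1.1",  # one of the spec's lightning_versions
    )
    model = client.get_model(model_uid)
    image_bytes = model.text_to_image("a watercolor fox in a bamboo forest")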
--- a/xinference/model/image/model_spec.json
+++ b/xinference/model/image/model_spec.json
@@ -169,7 +169,184 @@
     },
     "virtualenv": {
       "packages": [
-        "git+https://github.com/huggingface/diffusers.git",
+        "diffusers==0.35.1",
+        "peft>=0.17.0",
+        "#system_torch#",
+        "#system_numpy#"
+      ],
+      "no_build_isolation": true
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Qwen-Image",
+    "model_family": "stable_diffusion",
+    "model_ability": [
+      "text2image",
+      "image2image",
+      "inpainting"
+    ],
+    "model_src": {
+      "huggingface": {
+        "model_id": "Qwen/Qwen-Image",
+        "model_revision": "4516c4d3058302ff35cd86c62ffa645d039fefad",
+        "gguf_model_id": "city96/Qwen-Image-gguf",
+        "gguf_quantizations": [
+          "F16",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0",
+          "8steps-V1.1-bf16",
+          "8steps-V1.1"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Lightning-{lightning_version}.safetensors"
+      },
+      "modelscope": {
+        "model_id": "Qwen/Qwen-Image",
+        "model_revision": "master",
+        "gguf_model_id": "city96/Qwen-Image-gguf",
+        "gguf_quantizations": [
+          "F16",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0",
+          "8steps-V1.1-bf16",
+          "8steps-V1.1"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Lightning-{lightning_version}.safetensors"
+      }
+    },
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder",
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "guidance_scale": 1.0,
+      "true_cfg_scale": 1.0
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "peft>=0.17.0",
+        "#system_torch#",
+        "#system_numpy#"
+      ],
+      "no_build_isolation": true
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Qwen-Image-Edit",
+    "model_family": "stable_diffusion",
+    "model_ability": [
+      "image2image"
+    ],
+    "model_src": {
+      "huggingface": {
+        "model_id": "Qwen/Qwen-Image-Edit",
+        "model_revision": "0b71959872ea3bf4d106c578b7c480ebb133dba7",
+        "gguf_model_id": "QuantStack/Qwen-Image-Edit-GGUF",
+        "gguf_quantizations": [
+          "Q2_K",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "Qwen_Image_Edit-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0-bf16",
+          "8steps-V1.0"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Edit-Lightning-{lightning_version}.safetensors"
+      },
+      "modelscope": {
+        "model_id": "Qwen/Qwen-Image-Edit",
+        "model_revision": "master",
+        "gguf_model_id": "QuantStack/Qwen-Image-Edit-GGUF",
+        "gguf_quantizations": [
+          "Q2_K",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "Qwen_Image_Edit-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0-bf16",
+          "8steps-V1.0"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Edit-Lightning-{lightning_version}.safetensors"
+      }
+    },
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder",
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "true_cfg_scale": 4.0
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "peft>=0.17.0",
+        "#system_torch#",
         "#system_numpy#"
       ],
       "no_build_isolation": true
--- a/xinference/model/image/stable_diffusion/core.py
+++ b/xinference/model/image/stable_diffusion/core.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
 import contextlib
 import gc
 import importlib
@@ -19,6 +20,7 @@ import inspect
 import itertools
 import json
 import logging
+import math
 import os
 import re
 import sys
@@ -30,7 +32,11 @@ import PIL.Image
 import torch
 from PIL import ImageOps
 
-from ....device_utils import get_available_device, move_model_to_available_device
+from ....device_utils import (
+    get_available_device,
+    gpu_count,
+    move_model_to_available_device,
+)
 from ....types import LoRA
 from ..sdapi import SDAPIDiffusionModelMixin
 from ..utils import handle_image_result
@@ -89,6 +95,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_fuse_kwargs: Optional[Dict] = None,
         model_spec: Optional["ImageModelFamilyV2"] = None,
         gguf_model_path: Optional[str] = None,
+        lightning_model_path: Optional[str] = None,
         **kwargs,
     ):
         self.model_family = model_spec
@@ -115,6 +122,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         self._kwargs = kwargs
         # gguf
         self._gguf_model_path = gguf_model_path
+        # lightning
+        self._lightning_model_path = lightning_model_path
 
     @property
     def model_ability(self):
@@ -171,7 +180,32 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 )
                 model = model_type.from_pipe(self._model, controlnet=controlnet)
             else:
-                model = model_type.from_pipe(self._model)
+                try:
+                    from diffusers import (
+                        QwenImageImg2ImgPipeline,
+                        QwenImageInpaintPipeline,
+                        QwenImagePipeline,
+                    )
+                except ImportError:
+                    QwenImagePipeline = None
+                    QwenImageImg2ImgPipeline = None
+                    QwenImageInpaintPipeline = None
+
+                if QwenImagePipeline is not None and isinstance(
+                    self._model, QwenImagePipeline
+                ):
+                    # special process for Qwen-image
+                    if ability == "image2image":
+                        model = QwenImageImg2ImgPipeline.from_pipe(
+                            self._model, torch_dtype=None
+                        )
+                    else:
+                        assert ability == "inpainting"
+                        model = QwenImageInpaintPipeline.from_pipe(
+                            self._model, torch_dtype=None
+                        )
+                else:
+                    model = model_type.from_pipe(self._model)
             self._load_to_device(model)
 
             self._ability_to_models[ability, controlnet_name] = model
@@ -237,27 +271,42 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         else:
             self._quantize_transformer()
 
+        if (device_count := gpu_count()) > 1 and "device_map" not in self._kwargs:
+            logger.debug(
+                "Device count (%d) > 1, force to set device_map=balanced", device_count
+            )
+            self._kwargs["device_map"] = "balanced"
+
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
-        try:
-            self._model = AutoPipelineModel.from_pretrained(
-                self._model_path,
-                **self._kwargs,
-            )
-        except ValueError:
-            if "kontext" in self._model_spec.model_name.lower():
-                # TODO: remove this branch when auto pipeline supports
-                # flux.1-kontext-dev
-                from diffusers import FluxKontextPipeline
-
-                self._model = FluxKontextPipeline.from_pretrained(
-                    self._model_path, **self._kwargs
+        with self._process_lightning(self._kwargs):
+            try:
+                self._model = AutoPipelineModel.from_pretrained(
+                    self._model_path,
+                    **self._kwargs,
                 )
-            else:
-                raise
-        self._load_to_device(self._model)
-        self._apply_lora()
+            except ValueError:
+                if "kontext" in self._model_spec.model_name.lower():
+                    # TODO: remove this branch when auto pipeline supports
+                    # flux.1-kontext-dev
+                    from diffusers import FluxKontextPipeline
+
+                    self._model = FluxKontextPipeline.from_pretrained(
+                        self._model_path, **self._kwargs
+                    )
+                elif "qwen" in self._model_spec.model_name.lower():
+                    # TODO: remove this branch when auto pipeline supports
+                    # Qwen-Image
+                    from diffusers import DiffusionPipeline
+
+                    self._model = DiffusionPipeline.from_pretrained(
+                        self._model_path, **self._kwargs
+                    )
+                else:
+                    raise
+            self._load_to_device(self._model)
+            self._apply_lora()
 
         if self._kwargs.get("deepcache", False):
             try:
@@ -348,11 +397,19 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             return
 
         if not quantize_text_encoder:
+            logger.debug("No text encoder quantization")
             return
 
        quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
        quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
 
+        logger.debug(
+            "Quantize text encoder %s with method %s, quantization %s",
+            quantize_text_encoder,
+            quantization_method,
+            quantization,
+        )
+
         torch_dtype = self._torch_dtype
         for text_encoder_name in quantize_text_encoder.split(","):
             quantization_kwargs: Dict[str, Any] = {}
@@ -389,8 +446,13 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
 
         if not quantization:
             # skip if no quantization specified
+            logger.debug("No transformer quantization")
             return
 
+        logger.debug(
+            "Quantize transformer with %s, quantization %s", method, quantization
+        )
+
         torch_dtype = self._torch_dtype
         transformer_cls = self._get_layer_cls("transformer")
         quantization_config = self._get_quantize_config(
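
The new log lines sit in front of the existing "bnb" (bitsandbytes) path driven by quantize_text_encoder and the "8-bit" default. For reference only, a standalone sketch of what 8-bit text-encoder quantization typically looks like with transformers; this is not the exact code behind _get_quantize_config, and the helper name and defaults are assumptions:

    # Illustrative sketch; assumes a transformers-based text encoder and
    # bitsandbytes installed.
    import torch
    from transformers import AutoModel, BitsAndBytesConfig

    def load_text_encoder_8bit(model_path: str, subfolder: str = "text_encoder"):
        config = BitsAndBytesConfig(load_in_8bit=True)  # mirrors the "8-bit" default above
        return AutoModel.from_pretrained(
            model_path,
            subfolder=subfolder,
            quantization_config=config,
            torch_dtype=torch.bfloat16,
        )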
@@ -409,6 +471,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
 
         # GGUF transformer
         torch_dtype = self._torch_dtype
+        logger.debug("Quantize transformer with gguf file %s", self._gguf_model_path)
         self._kwargs["transformer"] = self._get_layer_cls(
             "transformer"
         ).from_single_file(
@@ -418,6 +481,44 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             config=os.path.join(self._model_path, "transformer"),
         )
 
+    @contextlib.contextmanager
+    def _process_lightning(self, kwargs):
+        lightning_model_path = self._lightning_model_path
+        if not lightning_model_path:
+            yield
+            return
+
+        from diffusers import FlowMatchEulerDiscreteScheduler
+
+        if "qwen" in self._model_spec.model_name.lower():
+            scheduler_config = {
+                "base_image_seq_len": 256,
+                "base_shift": math.log(3),  # We use shift=3 in distillation
+                "invert_sigmas": False,
+                "max_image_seq_len": 8192,
+                "max_shift": math.log(3),  # We use shift=3 in distillation
+                "num_train_timesteps": 1000,
+                "shift": 1.0,
+                "shift_terminal": None,  # set shift_terminal to None
+                "stochastic_sampling": False,
+                "time_shift_type": "exponential",
+                "use_beta_sigmas": False,
+                "use_dynamic_shifting": True,
+                "use_exponential_sigmas": False,
+                "use_karras_sigmas": False,
+            }
+            scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+            kwargs["scheduler"] = scheduler
+
+            yield
+
+            model = self._model
+            logger.debug("Loading lightning lora: %s", self._lightning_model_path)
+            model.load_lora_weights(self._lightning_model_path)
+        else:
+            logger.debug("No lightning applied")
+            yield
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
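
Outside of this wrapper, the same Lightning recipe reads roughly as below; a hedged standalone sketch, with the scheduler values copied from the hunk above and the repository and file names taken from the new model_spec.json entries (a diffusers release with Qwen-Image support is assumed, and the prompt and step count are placeholders):

    # Standalone sketch of the pattern _process_lightning implements: install a
    # distillation-tuned FlowMatchEulerDiscreteScheduler, load the pipeline,
    # then apply the Lightning LoRA and generate with few steps.
    import math
    import torch
    from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler.from_config(
        {
            "base_shift": math.log(3),
            "max_shift": math.log(3),
            "num_train_timesteps": 1000,
            "shift": 1.0,
            "shift_terminal": None,
            "time_shift_type": "exponential",
            "use_dynamic_shifting": True,
        }
    )
    pipe = DiffusionPipeline.from_pretrained(
        "Qwen/Qwen-Image", scheduler=scheduler, torch_dtype=torch.bfloat16
    )
    pipe.load_lora_weights(
        "lightx2v/Qwen-Image-Lightning",
        weight_name="Qwen-Image-Lightning-8steps-V1.1.safetensors",
    )
    image = pipe("a cat in a spacesuit", num_inference_steps=8).images[0]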
@@ -665,7 +766,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             await self._image_batch_scheduler.add_request(
                 prompt, future, n, size, response_format, **kwargs
             )
-            import asyncio
 
             fut = asyncio.wrap_future(future)
             return await fut
@@ -680,6 +780,18 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._image_batch_scheduler and not self._image_batch_scheduler._running:
             await self._image_batch_scheduler.start()
 
+    def _gen_config_for_lightning(self, kwargs):
+        if (
+            not kwargs.get("num_inference_steps")
+            and self._lightning_model_path is not None
+        ):
+            is_4_steps = "4steps" in self._lightning_model_path
+            if is_4_steps:
+                kwargs["num_inference_steps"] = 4
+            else:
+                assert "8steps" in self._lightning_model_path
+                kwargs["num_inference_steps"] = 8
+
     async def _direct_text_to_image(
         self,
         prompt: str,
@@ -692,14 +804,28 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         generate_kwargs = self._model_spec.default_generate_config.copy()  # type: ignore
         generate_kwargs.update({k: v for k, v in kwargs.items() if v is not None})
         generate_kwargs["width"], generate_kwargs["height"] = width, height
+        self._gen_config_for_lightning(generate_kwargs)
 
-        return self._call_model(
-            prompt=prompt,
-            num_images_per_prompt=n,
+        return await asyncio.to_thread(
+            self._call_model,
+            prompt=prompt,  # type: ignore
+            num_images_per_prompt=n,  # type: ignore
             response_format=response_format,
             **generate_kwargs,
         )
 
+    async def abort_request(self, request_id: str) -> str:
+        """Abort a running request."""
+        from ....model.scheduler.core import AbortRequestMessage
+
+        # Check if we have a cancel callback for this request
+        if hasattr(self, "_cancel_callbacks") and request_id in self._cancel_callbacks:
+            cancel_callback = self._cancel_callbacks.pop(request_id)
+            cancel_callback()
+            return AbortRequestMessage.DONE.name
+
+        return AbortRequestMessage.NO_OP.name
+
     @staticmethod
     def pad_to_multiple(image, multiple=8):
         x, y = image.size
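
The switch from a direct self._call_model(...) call to asyncio.to_thread keeps the worker's event loop responsive while the blocking diffusers pipeline runs. A reduced sketch of the pattern, detached from the class above (the function names here are placeholders, not xinference APIs):

    # Minimal sketch: run a blocking generate call in a worker thread so other
    # coroutines (health checks, new requests) keep being served.
    import asyncio

    def blocking_generate(prompt: str) -> str:
        # stands in for the synchronous pipeline call, which can block for minutes
        return f"png bytes for {prompt!r}"

    async def direct_text_to_image(prompt: str) -> str:
        return await asyncio.to_thread(blocking_generate, prompt)

    print(asyncio.run(direct_text_to_image("a watercolor fox")))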
@@ -747,6 +873,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if allow_width_height:
             kwargs["width"], kwargs["height"] = image.size
 
+        # generate config for lightning
+        self._gen_config_for_lightning(kwargs)
+
         return self._call_model(
             image=image,
             prompt=prompt,
@@ -797,6 +926,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         # calculate actual image size after padding
         kwargs["width"], kwargs["height"] = image.size
 
+        # generate config for lightning
+        self._gen_config_for_lightning(kwargs)
+
         return self._call_model(
             image=image,
             mask_image=mask_image,
--- a/xinference/model/llm/cache_manager.py
+++ b/xinference/model/llm/cache_manager.py
@@ -1,3 +1,17 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import logging
 import os
 from typing import TYPE_CHECKING, Optional
@@ -81,7 +95,7 @@ class LLMCacheManager(CacheManager):
         if not IS_NEW_HUGGINGFACE_HUB:
             use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
 
-        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "bnb", "mlx"]:
             download_dir = retry_download(
                 huggingface_hub.snapshot_download,
                 self._model_name,
@@ -144,7 +158,7 @@
         if self.get_cache_status():
             return cache_dir
 
-        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+        if self._model_format in ["pytorch", "gptq", "awq", "bnb", "fp8", "bnb", "mlx"]:
             download_dir = retry_download(
                 snapshot_download,
                 self._model_name,
@@ -234,7 +248,7 @@
         if self.get_cache_status():
             return cache_dir
 
-        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "bnb", "mlx"]:
             download_dir = retry_download(
                 snapshot_download,
                 self._model_name,