xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/model.py +3 -4
  5. xinference/core/supervisor.py +29 -1
  6. xinference/core/worker.py +4 -1
  7. xinference/deploy/cmdline.py +2 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/model/audio/core.py +5 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/kokoro.py +1 -1
  12. xinference/model/audio/kokoro_zh.py +124 -0
  13. xinference/model/audio/model_spec.json +64 -20
  14. xinference/model/embedding/flag/core.py +5 -0
  15. xinference/model/embedding/llama_cpp/core.py +22 -19
  16. xinference/model/embedding/sentence_transformers/core.py +19 -4
  17. xinference/model/embedding/vllm/core.py +40 -8
  18. xinference/model/image/cache_manager.py +56 -0
  19. xinference/model/image/core.py +9 -0
  20. xinference/model/image/model_spec.json +116 -9
  21. xinference/model/image/stable_diffusion/core.py +141 -31
  22. xinference/model/llm/core.py +10 -0
  23. xinference/model/llm/llama_cpp/core.py +42 -40
  24. xinference/model/llm/llm_family.json +435 -23
  25. xinference/model/llm/llm_family.py +1 -0
  26. xinference/model/llm/mlx/core.py +52 -33
  27. xinference/model/llm/sglang/core.py +2 -44
  28. xinference/model/llm/tool_parsers/__init__.py +58 -0
  29. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  30. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  31. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  32. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  33. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  34. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  35. xinference/model/llm/transformers/core.py +6 -12
  36. xinference/model/llm/utils.py +128 -46
  37. xinference/model/llm/vllm/core.py +8 -61
  38. xinference/model/rerank/core.py +3 -0
  39. xinference/model/rerank/sentence_transformers/core.py +1 -1
  40. xinference/model/rerank/vllm/core.py +56 -6
  41. xinference/model/utils.py +1 -2
  42. xinference/model/video/model_spec.json +95 -1
  43. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  44. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  45. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  46. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  47. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  48. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  49. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  50. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  51. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  52. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  53. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  54. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  55. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  56. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  57. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  58. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  61. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  63. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  64. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  65. xinference/types.py +105 -2
  66. xinference/ui/gradio/chat_interface.py +2 -0
  67. xinference/ui/gradio/media_interface.py +353 -7
  68. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  69. xinference/ui/web/ui/build/index.html +1 -1
  70. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  71. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  72. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  73. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  74. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  75. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  76. xinference/ui/web/ui/src/locales/en.json +2 -0
  77. xinference/ui/web/ui/src/locales/ja.json +2 -0
  78. xinference/ui/web/ui/src/locales/ko.json +2 -0
  79. xinference/ui/web/ui/src/locales/zh.json +2 -0
  80. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
  81. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
  82. xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
  83. xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
  88. /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  89. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  90. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  91. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  92. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/model/image/core.py:

@@ -51,6 +51,10 @@ class ImageModelFamilyV2(CacheableModelSpec, ModelInstanceInfoMixin):
     gguf_model_id: Optional[str]
     gguf_quantizations: Optional[List[str]]
     gguf_model_file_name_template: Optional[str]
+    lightning_model_id: Optional[str]
+    lightning_versions: Optional[List[str]]
+    lightning_model_file_name_template: Optional[str]
+
     virtualenv: Optional[VirtualEnvSettings]

     class Config:

@@ -180,6 +184,8 @@ def create_image_model_instance(
     model_path: Optional[str] = None,
     gguf_quantization: Optional[str] = None,
     gguf_model_path: Optional[str] = None,
+    lightning_version: Optional[str] = None,
+    lightning_model_path: Optional[str] = None,
     **kwargs,
 ) -> Union[DiffusionModel, MLXDiffusionModel, GotOCR2Model]:
     from .cache_manager import ImageCacheManager

@@ -235,6 +241,8 @@ def create_image_model_instance(
         model_path = cache_manager.cache()
     if not gguf_model_path and gguf_quantization:
         gguf_model_path = cache_manager.cache_gguf(gguf_quantization)
+    if not lightning_model_path and lightning_version:
+        lightning_model_path = cache_manager.cache_lightning(lightning_version)
     if peft_model_config is not None:
         lora_model = peft_model_config.peft_model
         lora_load_kwargs = peft_model_config.image_lora_load_kwargs

@@ -262,6 +270,7 @@ def create_image_model_instance(
         lora_fuse_kwargs=lora_fuse_kwargs,
         model_spec=model_spec,
         gguf_model_path=gguf_model_path,
+        lightning_model_path=lightning_model_path,
         **kwargs,
     )
     return model
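
Note: the new lightning_version / lightning_model_path kwargs mirror the existing gguf_quantization / gguf_model_path pair, with ImageCacheManager.cache_lightning() fetching the Lightning LoRA file. A minimal launch sketch, assuming extra launch kwargs reach create_image_model_instance the same way the GGUF ones do (the client plumbing is not part of this diff):

from xinference.client import Client

client = Client("http://localhost:9997")
# "lightning_version" is assumed to pass through launch kwargs; it is resolved to a
# local file via cache_manager.cache_lightning(), as in the hunk above.
model_uid = client.launch_model(
    model_name="Qwen-Image",
    model_type="image",
    lightning_version="8steps-V1.1",
)
model = client.get_model(model_uid)
image = model.text_to_image("a watercolor fox", size="1024*1024")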
xinference/model/image/model_spec.json:

@@ -169,7 +169,9 @@
     },
     "virtualenv": {
       "packages": [
-        "git+https://github.com/huggingface/diffusers.git",
+        "diffusers==0.35.1",
+        "peft>=0.17.0",
+        "#system_torch#",
         "#system_numpy#"
       ],
       "no_build_isolation": true

@@ -180,7 +182,9 @@
     "model_name": "Qwen-Image",
     "model_family": "stable_diffusion",
     "model_ability": [
-      "text2image"
+      "text2image",
+      "image2image",
+      "inpainting"
     ],
     "model_src": {
       "huggingface": {

@@ -202,7 +206,16 @@
           "Q6_K",
           "Q8_0"
         ],
-        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf"
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0",
+          "8steps-V1.1-bf16",
+          "8steps-V1.1"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Lightning-{lightning_version}.safetensors"
       },
       "modelscope": {
         "model_id": "Qwen/Qwen-Image",
@@ -223,7 +236,102 @@
           "Q6_K",
           "Q8_0"
         ],
-        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf"
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0",
+          "8steps-V1.1-bf16",
+          "8steps-V1.1"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Lightning-{lightning_version}.safetensors"
+      }
+    },
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder",
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "guidance_scale": 1.0,
+      "true_cfg_scale": 1.0
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "peft>=0.17.0",
+        "#system_torch#",
+        "#system_numpy#"
+      ],
+      "no_build_isolation": true
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Qwen-Image-Edit",
+    "model_family": "stable_diffusion",
+    "model_ability": [
+      "image2image"
+    ],
+    "model_src": {
+      "huggingface": {
+        "model_id": "Qwen/Qwen-Image-Edit",
+        "model_revision": "0b71959872ea3bf4d106c578b7c480ebb133dba7",
+        "gguf_model_id": "QuantStack/Qwen-Image-Edit-GGUF",
+        "gguf_quantizations": [
+          "Q2_K",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "Qwen_Image_Edit-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0-bf16",
+          "8steps-V1.0"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Edit-Lightning-{lightning_version}.safetensors"
+      },
+      "modelscope": {
+        "model_id": "Qwen/Qwen-Image-Edit",
+        "model_revision": "master",
+        "gguf_model_id": "QuantStack/Qwen-Image-Edit-GGUF",
+        "gguf_quantizations": [
+          "Q2_K",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "Qwen_Image_Edit-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0-bf16",
+          "8steps-V1.0"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Edit-Lightning-{lightning_version}.safetensors"
       }
     },
     "default_model_config": {
@@ -232,11 +340,11 @@
       "torch_dtype": "bfloat16"
     },
     "default_generate_config": {
-      "guidance_scale": 1.0
+      "true_cfg_scale": 4.0
     },
     "virtualenv": {
       "packages": [
-        "git+https://github.com/huggingface/diffusers.git",
+        "diffusers==0.35.1",
         "peft>=0.17.0",
         "#system_torch#",
         "#system_numpy#"

@@ -716,13 +824,12 @@
         "deepspeed==0.12.3",
         "peft==0.4.0",
         "tiktoken==0.6.0",
-        "bitsandbytes==0.41.0",
-        "scikit-learn==1.2.2",
         "sentencepiece==0.1.99",
         "einops==0.6.1",
         "einops-exts==0.0.4",
         "timm==0.6.13",
-        "numpy==1.26.4"
+        "#system_numpy#",
+        "#system_torch#"
       ]
     },
     "model_src": {
xinference/model/image/stable_diffusion/core.py:

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import asyncio
 import contextlib
 import gc
 import importlib

@@ -19,6 +20,7 @@ import inspect
 import itertools
 import json
 import logging
+import math
 import os
 import re
 import sys

@@ -30,7 +32,11 @@ import PIL.Image
 import torch
 from PIL import ImageOps

-from ....device_utils import get_available_device, move_model_to_available_device
+from ....device_utils import (
+    get_available_device,
+    gpu_count,
+    move_model_to_available_device,
+)
 from ....types import LoRA
 from ..sdapi import SDAPIDiffusionModelMixin
 from ..utils import handle_image_result

@@ -89,6 +95,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_fuse_kwargs: Optional[Dict] = None,
         model_spec: Optional["ImageModelFamilyV2"] = None,
         gguf_model_path: Optional[str] = None,
+        lightning_model_path: Optional[str] = None,
         **kwargs,
     ):
         self.model_family = model_spec

@@ -115,6 +122,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         self._kwargs = kwargs
         # gguf
         self._gguf_model_path = gguf_model_path
+        # lightning
+        self._lightning_model_path = lightning_model_path

     @property
     def model_ability(self):
@@ -171,7 +180,32 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             )
             model = model_type.from_pipe(self._model, controlnet=controlnet)
         else:
-            model = model_type.from_pipe(self._model)
+            try:
+                from diffusers import (
+                    QwenImageImg2ImgPipeline,
+                    QwenImageInpaintPipeline,
+                    QwenImagePipeline,
+                )
+            except ImportError:
+                QwenImagePipeline = None
+                QwenImageImg2ImgPipeline = None
+                QwenImageInpaintPipeline = None
+
+            if QwenImagePipeline is not None and isinstance(
+                self._model, QwenImagePipeline
+            ):
+                # special process for Qwen-image
+                if ability == "image2image":
+                    model = QwenImageImg2ImgPipeline.from_pipe(
+                        self._model, torch_dtype=None
+                    )
+                else:
+                    assert ability == "inpainting"
+                    model = QwenImageInpaintPipeline.from_pipe(
+                        self._model, torch_dtype=None
+                    )
+            else:
+                model = model_type.from_pipe(self._model)
         self._load_to_device(model)

         self._ability_to_models[ability, controlnet_name] = model
@@ -237,35 +271,42 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         else:
             self._quantize_transformer()

+        if (device_count := gpu_count()) > 1 and "device_map" not in self._kwargs:
+            logger.debug(
+                "Device count (%d) > 1, force to set device_map=balanced", device_count
+            )
+            self._kwargs["device_map"] = "balanced"
+
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
-        try:
-            self._model = AutoPipelineModel.from_pretrained(
-                self._model_path,
-                **self._kwargs,
-            )
-        except ValueError:
-            if "kontext" in self._model_spec.model_name.lower():
-                # TODO: remove this branch when auto pipeline supports
-                # flux.1-kontext-dev
-                from diffusers import FluxKontextPipeline
-
-                self._model = FluxKontextPipeline.from_pretrained(
-                    self._model_path, **self._kwargs
+        with self._process_lightning(self._kwargs):
+            try:
+                self._model = AutoPipelineModel.from_pretrained(
+                    self._model_path,
+                    **self._kwargs,
                 )
-            elif "qwen" in self._model_spec.model_name.lower():
-                # TODO: remove this branch when auto pipeline supports
-                # Qwen-Image
-                from diffusers import DiffusionPipeline
-
-                self._model = DiffusionPipeline.from_pretrained(
-                    self._model_path, **self._kwargs
-                )
-            else:
-                raise
-        self._load_to_device(self._model)
-        self._apply_lora()
+            except ValueError:
+                if "kontext" in self._model_spec.model_name.lower():
+                    # TODO: remove this branch when auto pipeline supports
+                    # flux.1-kontext-dev
+                    from diffusers import FluxKontextPipeline
+
+                    self._model = FluxKontextPipeline.from_pretrained(
+                        self._model_path, **self._kwargs
+                    )
+                elif "qwen" in self._model_spec.model_name.lower():
+                    # TODO: remove this branch when auto pipeline supports
+                    # Qwen-Image
+                    from diffusers import DiffusionPipeline
+
+                    self._model = DiffusionPipeline.from_pretrained(
+                        self._model_path, **self._kwargs
+                    )
+                else:
+                    raise
+            self._load_to_device(self._model)
+            self._apply_lora()

         if self._kwargs.get("deepcache", False):
             try:
@@ -440,6 +481,44 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             config=os.path.join(self._model_path, "transformer"),
         )

+    @contextlib.contextmanager
+    def _process_lightning(self, kwargs):
+        lightning_model_path = self._lightning_model_path
+        if not lightning_model_path:
+            yield
+            return
+
+        from diffusers import FlowMatchEulerDiscreteScheduler
+
+        if "qwen" in self._model_spec.model_name.lower():
+            scheduler_config = {
+                "base_image_seq_len": 256,
+                "base_shift": math.log(3),  # We use shift=3 in distillation
+                "invert_sigmas": False,
+                "max_image_seq_len": 8192,
+                "max_shift": math.log(3),  # We use shift=3 in distillation
+                "num_train_timesteps": 1000,
+                "shift": 1.0,
+                "shift_terminal": None,  # set shift_terminal to None
+                "stochastic_sampling": False,
+                "time_shift_type": "exponential",
+                "use_beta_sigmas": False,
+                "use_dynamic_shifting": True,
+                "use_exponential_sigmas": False,
+                "use_karras_sigmas": False,
+            }
+            scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+            kwargs["scheduler"] = scheduler
+
+            yield
+
+            model = self._model
+            logger.debug("Loading lightning lora: %s", self._lightning_model_path)
+            model.load_lora_weights(self._lightning_model_path)
+        else:
+            logger.debug("No lightning applied")
+            yield
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
@@ -687,7 +766,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             await self._image_batch_scheduler.add_request(
                 prompt, future, n, size, response_format, **kwargs
             )
-            import asyncio

             fut = asyncio.wrap_future(future)
             return await fut

@@ -702,6 +780,18 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._image_batch_scheduler and not self._image_batch_scheduler._running:
             await self._image_batch_scheduler.start()

+    def _gen_config_for_lightning(self, kwargs):
+        if (
+            not kwargs.get("num_inference_steps")
+            and self._lightning_model_path is not None
+        ):
+            is_4_steps = "4steps" in self._lightning_model_path
+            if is_4_steps:
+                kwargs["num_inference_steps"] = 4
+            else:
+                assert "8steps" in self._lightning_model_path
+                kwargs["num_inference_steps"] = 8
+
     async def _direct_text_to_image(
         self,
         prompt: str,
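
Note: _gen_config_for_lightning only fills num_inference_steps when the caller left it unset, reading the default off the "4steps"/"8steps" marker in the cached LoRA filename. Illustrative values:

kwargs = {}  # caller did not set num_inference_steps
lightning_model_path = "Qwen-Image-Lightning-8steps-V1.1.safetensors"  # example filename

if not kwargs.get("num_inference_steps") and lightning_model_path is not None:
    kwargs["num_inference_steps"] = 4 if "4steps" in lightning_model_path else 8

print(kwargs)  # {'num_inference_steps': 8}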
@@ -714,14 +804,28 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         generate_kwargs = self._model_spec.default_generate_config.copy()  # type: ignore
         generate_kwargs.update({k: v for k, v in kwargs.items() if v is not None})
         generate_kwargs["width"], generate_kwargs["height"] = width, height
+        self._gen_config_for_lightning(generate_kwargs)

-        return self._call_model(
-            prompt=prompt,
-            num_images_per_prompt=n,
+        return await asyncio.to_thread(
+            self._call_model,
+            prompt=prompt,  # type: ignore
+            num_images_per_prompt=n,  # type: ignore
             response_format=response_format,
             **generate_kwargs,
         )

+    async def abort_request(self, request_id: str) -> str:
+        """Abort a running request."""
+        from ....model.scheduler.core import AbortRequestMessage
+
+        # Check if we have a cancel callback for this request
+        if hasattr(self, "_cancel_callbacks") and request_id in self._cancel_callbacks:
+            cancel_callback = self._cancel_callbacks.pop(request_id)
+            cancel_callback()
+            return AbortRequestMessage.DONE.name
+
+        return AbortRequestMessage.NO_OP.name
+
     @staticmethod
     def pad_to_multiple(image, multiple=8):
         x, y = image.size
@@ -769,6 +873,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if allow_width_height:
             kwargs["width"], kwargs["height"] = image.size

+        # generate config for lightning
+        self._gen_config_for_lightning(kwargs)
+
         return self._call_model(
             image=image,
             prompt=prompt,

@@ -819,6 +926,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         # calculate actual image size after padding
         kwargs["width"], kwargs["height"] = image.size

+        # generate config for lightning
+        self._gen_config_for_lightning(kwargs)
+
         return self._call_model(
             image=image,
             mask_image=mask_image,
xinference/model/llm/core.py:

@@ -27,6 +27,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from .reasoning_parser import ReasoningParser
+from .tool_parsers import TOOL_PARSERS

 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV2, LLMSpecV1

@@ -59,6 +60,7 @@ class LLM(abc.ABC):
         self.quantization = model_family.model_specs[0].quantization
         self.model_path = model_path
         self.reasoning_parser = None
+        self.tool_parser = None
         if args:
             raise ValueError(f"Unrecognized positional arguments: {args}")
         if kwargs:

@@ -171,6 +173,14 @@ class LLM(abc.ABC):
             enable_thinking=enable_thinking,
         )

+    def prepare_parse_tool_calls(self):
+        if self.model_family.tool_parser is None:
+            return
+        if self.model_family.tool_parser not in TOOL_PARSERS:
+            return
+        tool_parser = TOOL_PARSERS[self.model_family.tool_parser]
+        self.tool_parser = tool_parser()
+

 # Context variable for passing per-request chat context (e.g., chat_template_kwargs).
 # This variable should be set at the beginning of each chat or stream_chat call.
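
Note: prepare_parse_tool_calls() pairs with the new xinference/model/llm/tool_parsers package (TOOL_PARSERS plus the qwen, llama3, glm4 and deepseek parsers in the file list above). A minimal lookup sketch; the "qwen" registry key is an assumption, only TOOL_PARSERS itself appears in this hunk:

from xinference.model.llm.tool_parsers import TOOL_PARSERS

family_tool_parser = "qwen"  # hypothetical value of model_family.tool_parser
if family_tool_parser in TOOL_PARSERS:
    # Same lookup as prepare_parse_tool_calls(): resolve the class and instantiate it.
    tool_parser = TOOL_PARSERS[family_tool_parser]()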
xinference/model/llm/llama_cpp/core.py:

@@ -19,11 +19,11 @@ import pprint
 import queue
 from typing import Iterator, List, Optional, Union

-import orjson
+from packaging import version

 from ....constants import XINFERENCE_MAX_TOKENS
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin

@@ -98,10 +98,19 @@ class XllamaCppModel(LLM, ChatModelMixin):
             from xllamacpp import (
                 CommonParams,
                 Server,
+                __version__,
                 estimate_gpu_layers,
                 get_device_info,
                 ggml_backend_dev_type,
             )
+
+            try:
+                if version.parse(__version__) < version.parse("0.2.0"):
+                    raise RuntimeError(
+                        "Please update xllamacpp to >= 0.2.0 by `pip install -U xllamacpp`"
+                    )
+            except version.InvalidVersion:
+                pass  # If the version parse failed, we just skip the version check.
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -113,6 +122,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         if os.path.isfile(self.model_path):
             # mostly passed from --model_path

@@ -160,6 +170,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             params.mmproj.path = mmproj
         if self.model_family.chat_template:
             params.chat_template = self.model_family.chat_template
+            params.use_jinja = True
         # This is the default value, could be overwritten by _llamacpp_model_config
         params.n_parallel = min(8, os.cpu_count() or 1)
         for k, v in self._llamacpp_model_config.items():

@@ -208,7 +219,8 @@ class XllamaCppModel(LLM, ChatModelMixin):
             )
             logger.info("Estimate num gpu layers: %s", estimate)
             if estimate.tensor_split:
-                params.tensor_split = estimate.tensor_split
+                for i in range(len(estimate.tensor_split)):
+                    params.tensor_split[i] = estimate.tensor_split[i]
             else:
                 params.n_gpu_layers = estimate.layers
         except Exception as e:
@@ -242,28 +254,18 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 {
                     "prompt": prompt,
                     "stream": stream,
+                    "model": self.model_uid,
                 }
             )
-            prompt_json = orjson.dumps(data)
-
-            def _error_callback(err):
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:

-            def _ok_callback(ok):
-                try:
-                    res = orjson.loads(ok)
-                    res["model"] = self.model_uid
-                    q.put(res)
-                except Exception as e:
-                    logger.exception("handle_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)

-            try:
-                self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
+                self._llm.handle_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_completions failed: %s", ex)
                 q.put(_Error(str(ex)))
@@ -296,6 +298,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
             generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
         stream = generate_config.get("stream", False)
+
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
+        chat_context_var.set(chat_template_kwargs)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()

@@ -310,30 +321,21 @@ class XllamaCppModel(LLM, ChatModelMixin):
                     "messages": messages,
                     "stream": stream,
                     "tools": tools,
+                    "model": self.model_uid,
                 }
             )
-            prompt_json = orjson.dumps(data)
+            if chat_template_kwargs:
+                data["chat_template_kwargs"] = chat_template_kwargs

-            def _error_callback(err):
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:

-            def _ok_callback(ok):
-                try:
-                    res = orjson.loads(ok)
-                    res["model"] = self.model_uid
-                    q.put(res)
-                except Exception as e:
-                    logger.exception("handle_chat_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)

-            try:
-                self._llm.handle_chat_completions(
-                    prompt_json, _error_callback, _ok_callback
-                )
+                self._llm.handle_chat_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_chat_completions failed: %s", ex)
                 q.put(_Error(str(ex)))
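
Note: with chat_template_kwargs now read from generate_config and forwarded as data["chat_template_kwargs"], per-request template options reach the llama.cpp backend. A hedged client-side example; the keys a model accepts depend on its chat template, and enable_thinking here is only illustrative:

from xinference.client import Client

client = Client("http://localhost:9997")
model = client.get_model("my-gguf-chat-model")  # hypothetical model uid

# generate_config is where _get_chat_template_kwargs_from_generate_config() looks,
# per the hunk above.
completion = model.chat(
    messages=[{"role": "user", "content": "Hello"}],
    generate_config={"chat_template_kwargs": {"enable_thinking": False}},
)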