xinference-1.0.1-py3-none-any.whl → xinference-1.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (170)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +28 -6
  4. xinference/core/utils.py +10 -6
  5. xinference/deploy/cmdline.py +3 -1
  6. xinference/deploy/test/test_cmdline.py +56 -0
  7. xinference/isolation.py +24 -0
  8. xinference/model/audio/core.py +10 -0
  9. xinference/model/audio/cosyvoice.py +25 -3
  10. xinference/model/audio/f5tts.py +200 -0
  11. xinference/model/audio/f5tts_mlx.py +260 -0
  12. xinference/model/audio/fish_speech.py +36 -111
  13. xinference/model/audio/model_spec.json +27 -3
  14. xinference/model/audio/model_spec_modelscope.json +18 -0
  15. xinference/model/audio/utils.py +32 -0
  16. xinference/model/embedding/core.py +203 -142
  17. xinference/model/embedding/model_spec.json +7 -0
  18. xinference/model/embedding/model_spec_modelscope.json +8 -0
  19. xinference/model/image/core.py +69 -1
  20. xinference/model/image/model_spec.json +127 -4
  21. xinference/model/image/model_spec_modelscope.json +130 -4
  22. xinference/model/image/stable_diffusion/core.py +45 -13
  23. xinference/model/llm/__init__.py +2 -2
  24. xinference/model/llm/llm_family.json +219 -53
  25. xinference/model/llm/llm_family.py +15 -36
  26. xinference/model/llm/llm_family_modelscope.json +167 -20
  27. xinference/model/llm/mlx/core.py +287 -51
  28. xinference/model/llm/sglang/core.py +1 -0
  29. xinference/model/llm/transformers/chatglm.py +9 -5
  30. xinference/model/llm/transformers/core.py +1 -0
  31. xinference/model/llm/transformers/qwen2_vl.py +2 -0
  32. xinference/model/llm/transformers/utils.py +16 -8
  33. xinference/model/llm/utils.py +5 -1
  34. xinference/model/llm/vllm/core.py +16 -2
  35. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  36. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  37. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  38. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  39. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  40. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  41. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  42. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  43. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  44. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  45. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  46. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  47. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  48. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  49. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  50. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  51. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  52. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  53. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  54. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  55. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  56. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  57. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  58. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  59. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  60. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  61. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  62. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  63. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  64. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  65. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  66. xinference/thirdparty/f5_tts/api.py +166 -0
  67. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  68. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  69. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  70. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  71. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  72. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  73. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  74. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  75. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  76. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  77. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  78. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  79. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  80. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  81. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  82. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  83. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  84. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  85. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  86. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  87. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  88. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  89. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  90. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  91. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  92. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  93. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  94. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  95. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  96. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  97. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  98. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  99. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  100. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  101. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  102. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  103. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  104. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  105. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  106. xinference/thirdparty/f5_tts/train/README.md +77 -0
  107. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  108. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  109. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  110. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  111. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  112. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  113. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  114. xinference/thirdparty/f5_tts/train/train.py +75 -0
  115. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  116. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  117. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  118. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  119. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  120. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  121. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  122. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  123. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  124. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  125. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  126. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  127. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  128. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  129. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  130. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  131. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  132. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  133. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  134. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  135. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  136. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  137. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  138. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  139. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  140. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  141. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  142. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  143. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  144. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  145. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  146. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  147. xinference/thirdparty/matcha/utils/utils.py +2 -2
  148. xinference/web/ui/build/asset-manifest.json +3 -3
  149. xinference/web/ui/build/index.html +1 -1
  150. xinference/web/ui/build/static/js/{main.2f269bb3.js → main.4eb4ee80.js} +3 -3
  151. xinference/web/ui/build/static/js/main.4eb4ee80.js.map +1 -0
  152. xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +1 -0
  153. {xinference-1.0.1.dist-info → xinference-1.1.1.dist-info}/METADATA +41 -17
  154. {xinference-1.0.1.dist-info → xinference-1.1.1.dist-info}/RECORD +160 -88
  155. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  156. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  157. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  158. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  159. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  160. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  161. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  162. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  163. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  164. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  165. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  166. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.4eb4ee80.js.LICENSE.txt} +0 -0
  167. {xinference-1.0.1.dist-info → xinference-1.1.1.dist-info}/LICENSE +0 -0
  168. {xinference-1.0.1.dist-info → xinference-1.1.1.dist-info}/WHEEL +0 -0
  169. {xinference-1.0.1.dist-info → xinference-1.1.1.dist-info}/entry_points.txt +0 -0
  170. {xinference-1.0.1.dist-info → xinference-1.1.1.dist-info}/top_level.txt +0 -0
xinference/model/image/model_spec.json

@@ -11,8 +11,24 @@
  ],
  "default_model_config": {
  "quantize": true,
- "quantize_text_encoder": "text_encoder_2"
- }
+ "quantize_text_encoder": "text_encoder_2",
+ "torch_dtype": "bfloat16"
+ },
+ "gguf_model_id": "city96/FLUX.1-schnell-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q2_K",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_1",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf"
  },
  {
  "model_name": "FLUX.1-dev",
@@ -26,8 +42,24 @@
  ],
  "default_model_config": {
  "quantize": true,
- "quantize_text_encoder": "text_encoder_2"
- }
+ "quantize_text_encoder": "text_encoder_2",
+ "torch_dtype": "bfloat16"
+ },
+ "gguf_model_id": "city96/FLUX.1-dev-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q2_K",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_1",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "flux1-dev-{quantization}.gguf"
  },
  {
  "model_name": "sd3-medium",
@@ -44,6 +76,97 @@
  "quantize_text_encoder": "text_encoder_3"
  }
  },
+ {
+ "model_name": "sd3.5-medium",
+ "model_family": "stable_diffusion",
+ "model_id": "stabilityai/stable-diffusion-3.5-medium",
+ "model_revision": "94b13ccbe959c51e8159d91f562c58f29fac971a",
+ "model_ability": [
+ "text2image",
+ "image2image",
+ "inpainting"
+ ],
+ "default_model_config": {
+ "quantize": true,
+ "quantize_text_encoder": "text_encoder_3",
+ "torch_dtype": "bfloat16"
+ },
+ "gguf_model_id": "city96/stable-diffusion-3.5-medium-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_1",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf"
+ },
+ {
+ "model_name": "sd3.5-large",
+ "model_family": "stable_diffusion",
+ "model_id": "stabilityai/stable-diffusion-3.5-large",
+ "model_revision": "ceddf0a7fdf2064ea28e2213e3b84e4afa170a0f",
+ "model_ability": [
+ "text2image",
+ "image2image",
+ "inpainting"
+ ],
+ "default_model_config": {
+ "quantize": true,
+ "quantize_text_encoder": "text_encoder_3",
+ "torch_dtype": "bfloat16",
+ "transformer_nf4": true
+ },
+ "gguf_model_id": "city96/stable-diffusion-3.5-large-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q4_0",
+ "Q4_1",
+ "Q5_0",
+ "Q5_1",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf"
+ },
+ {
+ "model_name": "sd3.5-large-turbo",
+ "model_family": "stable_diffusion",
+ "model_id": "stabilityai/stable-diffusion-3.5-large-turbo",
+ "model_revision": "ec07796fc06b096cc56de9762974a28f4c632eda",
+ "model_ability": [
+ "text2image",
+ "image2image",
+ "inpainting"
+ ],
+ "default_model_config": {
+ "quantize": true,
+ "quantize_text_encoder": "text_encoder_3",
+ "torch_dtype": "bfloat16",
+ "transformer_nf4": true
+ },
+ "default_generate_config": {
+ "guidance_scale": 1.0,
+ "num_inference_steps": 4
+ },
+ "gguf_model_id": "city96/stable-diffusion-3.5-large-turbo-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q4_0",
+ "Q4_1",
+ "Q5_0",
+ "Q5_1",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
+ },
  {
  "model_name": "sd-turbo",
  "model_family": "stable_diffusion",
xinference/model/image/model_spec_modelscope.json

@@ -12,8 +12,24 @@
  ],
  "default_model_config": {
  "quantize": true,
- "quantize_text_encoder": "text_encoder_2"
- }
+ "quantize_text_encoder": "text_encoder_2",
+ "torch_dtype": "bfloat16"
+ },
+ "gguf_model_id": "Xorbits/FLUX.1-schnell-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q2_K",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_1",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf"
  },
  {
  "model_name": "FLUX.1-dev",
@@ -28,8 +44,24 @@
  ],
  "default_model_config": {
  "quantize": true,
- "quantize_text_encoder": "text_encoder_2"
- }
+ "quantize_text_encoder": "text_encoder_2",
+ "torch_dtype": "bfloat16"
+ },
+ "gguf_model_id": "AI-ModelScope/FLUX.1-dev-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q2_K",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_1",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "flux1-dev-{quantization}.gguf"
  },
  {
  "model_name": "sd3-medium",
@@ -47,6 +79,100 @@
  "quantize_text_encoder": "text_encoder_3"
  }
  },
+ {
+ "model_name": "sd3.5-medium",
+ "model_family": "stable_diffusion",
+ "model_hub": "modelscope",
+ "model_id": "AI-ModelScope/stable-diffusion-3.5-medium",
+ "model_revision": "master",
+ "model_ability": [
+ "text2image",
+ "image2image",
+ "inpainting"
+ ],
+ "default_model_config": {
+ "quantize": true,
+ "quantize_text_encoder": "text_encoder_3",
+ "torch_dtype": "bfloat16"
+ },
+ "gguf_model_id": "Xorbits/stable-diffusion-3.5-medium-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_1",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf"
+ },
+ {
+ "model_name": "sd3.5-large",
+ "model_family": "stable_diffusion",
+ "model_hub": "modelscope",
+ "model_id": "AI-ModelScope/stable-diffusion-3.5-large",
+ "model_revision": "master",
+ "model_ability": [
+ "text2image",
+ "image2image",
+ "inpainting"
+ ],
+ "default_model_config": {
+ "quantize": true,
+ "quantize_text_encoder": "text_encoder_3",
+ "torch_dtype": "bfloat16",
+ "transformer_nf4": true
+ },
+ "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q4_0",
+ "Q4_1",
+ "Q5_0",
+ "Q5_1",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf"
+ },
+ {
+ "model_name": "sd3.5-large-turbo",
+ "model_family": "stable_diffusion",
+ "model_hub": "modelscope",
+ "model_id": "AI-ModelScope/stable-diffusion-3.5-large-turbo",
+ "model_revision": "master",
+ "model_ability": [
+ "text2image",
+ "image2image",
+ "inpainting"
+ ],
+ "default_model_config": {
+ "quantize": true,
+ "quantize_text_encoder": "text_encoder_3",
+ "torch_dtype": "bfloat16",
+ "transformer_nf4": true
+ },
+ "default_generate_config": {
+ "guidance_scale": 1.0,
+ "num_inference_steps": 4
+ },
+ "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-turbo-gguf",
+ "gguf_quantizations": [
+ "F16",
+ "Q4_0",
+ "Q4_1",
+ "Q5_0",
+ "Q5_1",
+ "Q8_0"
+ ],
+ "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
+ },
  {
  "model_name": "sd-turbo",
  "model_family": "stable_diffusion",
xinference/model/image/stable_diffusion/core.py

@@ -14,8 +14,10 @@

  import contextlib
  import gc
+ import importlib
  import inspect
  import itertools
+ import json
  import logging
  import os
  import re
@@ -86,6 +88,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
  lora_load_kwargs: Optional[Dict] = None,
  lora_fuse_kwargs: Optional[Dict] = None,
  model_spec: Optional["ImageModelFamilyV1"] = None,
+ gguf_model_path: Optional[str] = None,
  **kwargs,
  ):
  self._model_uid = model_uid
@@ -109,6 +112,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
  self._model_spec = model_spec
  self._abilities = model_spec.model_ability or []  # type: ignore
  self._kwargs = kwargs
+ # gguf
+ self._gguf_model_path = gguf_model_path

  @property
  def model_ability(self):
@@ -184,7 +189,17 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
  self._model.fuse_lora(**self._lora_fuse_kwargs)
  logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")

+ def _get_layer_cls(self, layer: str):
+ with open(os.path.join(self._model_path, "model_index.json")) as f:  # type: ignore
+ model_index = json.load(f)
+ layer_info = model_index[layer]
+ module_name, class_name = layer_info
+ module = importlib.import_module(module_name)
+ return getattr(module, class_name)
+
  def load(self):
+ from transformers import BitsAndBytesConfig, T5EncoderModel
+
  if "text2image" in self._abilities or "image2image" in self._abilities:
  from diffusers import AutoPipelineForText2Image as AutoPipelineModel
  elif "inpainting" in self._abilities:
@@ -200,7 +215,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
  glob(os.path.join(self._model_path, "*/*.safetensors"))
  )
  if isinstance(torch_dtype, str):
- self._kwargs["torch_dtype"] = getattr(torch, torch_dtype)
+ self._torch_dtype = torch_dtype = self._kwargs["torch_dtype"] = getattr(
+ torch, torch_dtype
+ )

  controlnet = self._kwargs.get("controlnet")
  if controlnet is not None:
@@ -212,18 +229,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
  ]

  quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
- if quantize_text_encoder:
- try:
- from transformers import BitsAndBytesConfig, T5EncoderModel
- except ImportError:
- error_message = "Failed to import module 'transformers'"
- installation_guide = [
- "Please make sure 'transformers' is installed. ",
- "You can install it by `pip install transformers`\n",
- ]
-
- raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
+ if quantize_text_encoder and not self._gguf_model_path:
  try:
  import bitsandbytes  # noqa: F401
  except ImportError:
@@ -249,6 +255,32 @@
  self._kwargs[text_encoder_name] = text_encoder
  self._kwargs["device_map"] = "balanced"

+ if self._gguf_model_path:
+ from diffusers import GGUFQuantizationConfig
+
+ # GGUF transformer
+ self._kwargs["transformer"] = self._get_layer_cls(
+ "transformer"
+ ).from_single_file(
+ self._gguf_model_path,
+ quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+ torch_dtype=torch_dtype,
+ config=os.path.join(self._model_path, "transformer"),
+ )
+ elif self._kwargs.get("transformer_nf4"):
+ nf4_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_compute_dtype=torch_dtype,
+ )
+ model_nf4 = self._get_layer_cls("transformer").from_pretrained(
+ self._model_path,
+ subfolder="transformer",
+ quantization_config=nf4_config,
+ torch_dtype=torch_dtype,
+ )
+ self._kwargs["transformer"] = model_nf4
+
  logger.debug(
  "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
  )
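Outside of xinference, the two quantized-transformer branches above correspond to standard diffusers usage: load a single-file GGUF transformer with GGUFQuantizationConfig, or load the regular checkpoint with 4-bit NF4 weights via BitsAndBytesConfig. A hedged standalone sketch (paths are placeholders, the FLUX transformer class is an assumption about the checkpoint being loaded, and a diffusers release new enough to ship GGUF support is required):

# Standalone sketch of the two branches; paths are placeholders and the
# FLUX class is an assumption, not taken from the diff above.
import torch
from diffusers import (
    BitsAndBytesConfig,
    FluxTransformer2DModel,
    GGUFQuantizationConfig,
)

dtype = torch.bfloat16

# GGUF path: quantized single-file transformer, dequantized on the fly.
gguf_transformer = FluxTransformer2DModel.from_single_file(
    "/models/flux1-dev-Q4_0.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=dtype),
    torch_dtype=dtype,
    config="/models/FLUX.1-dev/transformer",
)

# NF4 path: regular checkpoint, weights quantized to 4-bit at load time.
nf4_transformer = FluxTransformer2DModel.from_pretrained(
    "/models/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=dtype,
    ),
    torch_dtype=dtype,
)

# Either object is then passed to the pipeline constructor as transformer=...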
xinference/model/llm/__init__.py

@@ -131,7 +131,7 @@ def register_custom_model():
  def _install():
  from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
  from .lmdeploy.core import LMDeployChatModel, LMDeployModel
- from .mlx.core import MLXChatModel, MLXModel
+ from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
  from .sglang.core import SGLANGChatModel, SGLANGModel
  from .transformers.chatglm import ChatglmPytorchChatModel
  from .transformers.cogvlm2 import CogVLM2Model
@@ -172,7 +172,7 @@ def _install():
  )
  SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
  VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
- MLX_CLASSES.extend([MLXModel, MLXChatModel])
+ MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
  LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
  TRANSFORMERS_CLASSES.extend(
  [