PyPI - diffusers - Versions diffs - 0.23.0__py3-none-any.whl → 0.24.0__py3-none-any.whl - Mend

diffusers 0.23.0py3-none-any.whl → 0.24.0py3-none-any.whl

Files changed (177) hide show

diffusers/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.23.0"
+__version__ = "0.24.0"
 from typing import TYPE_CHECKING
@@ -76,9 +76,11 @@ else:
         [
             "AsymmetricAutoencoderKL",
             "AutoencoderKL",
+            "AutoencoderKLTemporalDecoder",
             "AutoencoderTiny",
             "ConsistencyDecoderVAE",
             "ControlNetModel",
+            "Kandinsky3UNet",
             "ModelMixin",
             "MotionAdapter",
             "MultiAdapter",
@@ -91,9 +93,11 @@ else:
             "UNet2DModel",
             "UNet3DConditionModel",
             "UNetMotionModel",
+            "UNetSpatioTemporalConditionModel",
             "VQModel",
         ]
     )
     _import_structure["optimization"] = [
         "get_constant_schedule",
         "get_constant_schedule_with_warmup",
@@ -103,7 +107,6 @@ else:
         "get_polynomial_decay_schedule_with_warmup",
         "get_scheduler",
     ]
     _import_structure["pipelines"].extend(
         [
             "AudioPipelineOutput",
@@ -214,6 +217,8 @@ else:
             "IFPipeline",
             "IFSuperResolutionPipeline",
             "ImageTextPipelineOutput",
+            "Kandinsky3Img2ImgPipeline",
+            "Kandinsky3Pipeline",
             "KandinskyCombinedPipeline",
             "KandinskyImg2ImgCombinedPipeline",
             "KandinskyImg2ImgPipeline",
@@ -274,8 +279,10 @@ else:
             "StableDiffusionXLPipeline",
             "StableUnCLIPImg2ImgPipeline",
             "StableUnCLIPPipeline",
+            "StableVideoDiffusionPipeline",
             "TextToVideoSDPipeline",
             "TextToVideoZeroPipeline",
+            "TextToVideoZeroSDXLPipeline",
             "UnCLIPImageVariationPipeline",
             "UnCLIPPipeline",
             "UniDiffuserModel",
@@ -443,9 +450,11 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from .models import (
             AsymmetricAutoencoderKL,
             AutoencoderKL,
+            AutoencoderKLTemporalDecoder,
             AutoencoderTiny,
             ConsistencyDecoderVAE,
             ControlNetModel,
+            Kandinsky3UNet,
             ModelMixin,
             MotionAdapter,
             MultiAdapter,
@@ -458,6 +467,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             UNet2DModel,
             UNet3DConditionModel,
             UNetMotionModel,
+            UNetSpatioTemporalConditionModel,
             VQModel,
         )
         from .optimization import (
@@ -560,6 +570,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             IFPipeline,
             IFSuperResolutionPipeline,
             ImageTextPipelineOutput,
+            Kandinsky3Img2ImgPipeline,
+            Kandinsky3Pipeline,
             KandinskyCombinedPipeline,
             KandinskyImg2ImgCombinedPipeline,
             KandinskyImg2ImgPipeline,
@@ -620,8 +632,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             StableDiffusionXLPipeline,
             StableUnCLIPImg2ImgPipeline,
             StableUnCLIPPipeline,
+            StableVideoDiffusionPipeline,
             TextToVideoSDPipeline,
             TextToVideoZeroPipeline,
+            TextToVideoZeroSDXLPipeline,
             UnCLIPImageVariationPipeline,
             UnCLIPPipeline,
             UniDiffuserModel,

diffusers/configuration_utils.py CHANGED Viewed

@@ -95,6 +95,7 @@ class ConfigMixin:
           should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by
           subclass).
     """
     config_name = None
     ignore_for_config = []
     has_compatibles = False

diffusers/dependency_versions_check.py CHANGED Viewed

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
 from .dependency_versions_table import deps
 from .utils.versions import require_version, require_version_core
@@ -23,21 +22,9 @@ from .utils.versions import require_version, require_version_core
 # order specific notes:
 # - tqdm must be checked before tokenizers
-pkgs_to_check_at_runtime = "python tqdm regex requests packaging filelock numpy tokenizers".split()
-if sys.version_info < (3, 7):
-    pkgs_to_check_at_runtime.append("dataclasses")
-if sys.version_info < (3, 8):
-    pkgs_to_check_at_runtime.append("importlib_metadata")
+pkgs_to_check_at_runtime = "python requests filelock numpy".split()
 for pkg in pkgs_to_check_at_runtime:
     if pkg in deps:
-        if pkg == "tokenizers":
-            # must be loaded here, or else tqdm check may fail
-            from .utils import is_tokenizers_available
-            if not is_tokenizers_available():
-                continue  # not required, check version only if installed
         require_version_core(deps[pkg])
     else:
         raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")

diffusers/dependency_versions_table.py CHANGED Viewed

@@ -1,16 +1,15 @@
 # THIS FILE HAS BEEN AUTOGENERATED. To update:
 # 1. modify the `_deps` dict in setup.py
-# 2. run `make deps_table_update``
+# 2. run `make deps_table_update`
 deps = {
     "Pillow": "Pillow",
     "accelerate": "accelerate>=0.11.0",
     "compel": "compel==0.1.8",
-    "black": "black~=23.1",
     "datasets": "datasets",
     "filelock": "filelock",
     "flax": "flax>=0.4.1",
     "hf-doc-builder": "hf-doc-builder>=0.3.0",
-    "huggingface-hub": "huggingface-hub>=0.13.2",
+    "huggingface-hub": "huggingface-hub>=0.19.4",
     "requests-mock": "requests-mock==1.10.0",
     "importlib_metadata": "importlib_metadata",
     "invisible-watermark": "invisible-watermark>=0.2.0",
@@ -25,11 +24,13 @@ deps = {
     "numpy": "numpy",
     "omegaconf": "omegaconf",
     "parameterized": "parameterized",
+    "peft": "peft>=0.6.0",
     "protobuf": "protobuf>=3.20.3,<4",
     "pytest": "pytest",
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
-    "ruff": "ruff==0.0.280",
+    "python": "python>=3.8.0",
+    "ruff": "ruff>=0.1.5,<=0.2",
     "safetensors": "safetensors>=0.3.1",
     "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
     "scipy": "scipy",

diffusers/image_processor.py CHANGED Viewed

@@ -13,7 +13,7 @@
 # limitations under the License.
 import warnings
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 import numpy as np
 import PIL.Image
@@ -33,6 +33,15 @@ PipelineImageInput = Union[
     List[torch.FloatTensor],
 ]
+PipelineDepthInput = Union[
+    PIL.Image.Image,
+    np.ndarray,
+    torch.FloatTensor,
+    List[PIL.Image.Image],
+    List[np.ndarray],
+    List[torch.FloatTensor],
+]
 class VaeImageProcessor(ConfigMixin):
     """
@@ -126,14 +135,14 @@ class VaeImageProcessor(ConfigMixin):
         return images
     @staticmethod
-    def normalize(images):
+    def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         """
         Normalize an image array to [-1,1].
         """
         return 2.0 * images - 1.0
     @staticmethod
-    def denormalize(images):
+    def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         """
         Denormalize an image array to [0,1].
         """
@@ -159,10 +168,10 @@ class VaeImageProcessor(ConfigMixin):
     def get_default_height_width(
         self,
-        image: [PIL.Image.Image, np.ndarray, torch.Tensor],
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
         height: Optional[int] = None,
         width: Optional[int] = None,
-    ):
+    ) -> Tuple[int, int]:
         """
         This function returns the height and width that are downscaled to the next integer multiple of
         `vae_scale_factor`.
@@ -202,12 +211,24 @@ class VaeImageProcessor(ConfigMixin):
     def resize(
         self,
-        image: [PIL.Image.Image, np.ndarray, torch.Tensor],
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
         height: Optional[int] = None,
         width: Optional[int] = None,
-    ) -> [PIL.Image.Image, np.ndarray, torch.Tensor]:
+    ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
         """
         Resize image.
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
+                The image input, can be a PIL image, numpy array or pytorch tensor.
+            height (`int`, *optional*, defaults to `None`):
+                The height to resize to.
+            width (`int`, *optional*, defaults to `None`):
+                The width to resize to.
+        Returns:
+            `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
+                The resized image.
         """
         if isinstance(image, PIL.Image.Image):
             image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
@@ -227,7 +248,15 @@ class VaeImageProcessor(ConfigMixin):
     def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
         """
-        create a mask
+        Create a mask.
+        Args:
+            image (`PIL.Image.Image`):
+                The image input, should be a PIL image.
+        Returns:
+            `PIL.Image.Image`:
+                The binarized image. Values less than 0.5 are set to 0, values greater than or equal to 0.5 are set to 1.
         """
         image[image < 0.5] = 0
         image[image >= 0.5] = 1
@@ -306,7 +335,7 @@ class VaeImageProcessor(ConfigMixin):
         # expected range [0,1], normalize to [-1,1]
         do_normalize = self.config.do_normalize
-        if image.min() < 0 and do_normalize:
+        if do_normalize and image.min() < 0:
             warnings.warn(
                 "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
                 f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
@@ -327,7 +356,23 @@ class VaeImageProcessor(ConfigMixin):
         image: torch.FloatTensor,
         output_type: str = "pil",
         do_denormalize: Optional[List[bool]] = None,
-    ):
+    ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]:
+        """
+        Postprocess the image output from tensor to `output_type`.
+        Args:
+            image (`torch.FloatTensor`):
+                The image input, should be a pytorch tensor with shape `B x C x H x W`.
+            output_type (`str`, *optional*, defaults to `pil`):
+                The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
+            do_denormalize (`List[bool]`, *optional*, defaults to `None`):
+                Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
+                `VaeImageProcessor` config.
+        Returns:
+            `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`:
+                The postprocessed image.
+        """
         if not isinstance(image, torch.Tensor):
             raise ValueError(
                 f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
@@ -390,7 +435,7 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
         super().__init__()
     @staticmethod
-    def numpy_to_pil(images):
+    def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
         """
         Convert a NumPy image or a batch of images to a PIL image.
         """
@@ -406,7 +451,19 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
         return pil_images
     @staticmethod
-    def rgblike_to_depthmap(image):
+    def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
+        """
+        Convert a PIL image or a list of PIL images to NumPy arrays.
+        """
+        if not isinstance(images, list):  # a single image is wrapped so both input forms share one code path
+            images = [images]
+        images = [np.array(image).astype(np.float32) / (2**16 - 1) for image in images]  # divides by 65535; presumably the 16-bit maximum, giving [0, 1] values — TODO confirm input bit depth
+        images = np.stack(images, axis=0)  # adds a new leading axis with one entry per input image
+        return images
+    @staticmethod
+    def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         """
         Args:
             image: RGB-like depth image
@@ -416,7 +473,7 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
         """
         return image[:, :, 1] * 2**8 + image[:, :, 2]
-    def numpy_to_depth(self, images):
+    def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
         """
         Convert a NumPy depth image or a batch of images to a PIL image.
         """
@@ -441,7 +498,23 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
         image: torch.FloatTensor,
         output_type: str = "pil",
         do_denormalize: Optional[List[bool]] = None,
-    ):
+    ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]:
+        """
+        Postprocess the image output from tensor to `output_type`.
+        Args:
+            image (`torch.FloatTensor`):
+                The image input, should be a pytorch tensor with shape `B x C x H x W`.
+            output_type (`str`, *optional*, defaults to `pil`):
+                The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
+            do_denormalize (`List[bool]`, *optional*, defaults to `None`):
+                Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
+                `VaeImageProcessor` config.
+        Returns:
+            `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`:
+                The postprocessed image.
+        """
         if not isinstance(image, torch.Tensor):
             raise ValueError(
                 f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
@@ -474,3 +547,102 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
             return self.numpy_to_pil(image), self.numpy_to_depth(image)
         else:
             raise Exception(f"This type {output_type} is not supported")
+    def preprocess(
+        self,
+        rgb: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
+        depth: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        target_res: Optional[int] = None,
+    ) -> torch.Tensor:  # NOTE(review): the body ends with `return rgb, depth` (a pair), not a single tensor
+        """
+        Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors.
+        """
+        supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
+        # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
+        if self.config.do_convert_grayscale and isinstance(rgb, (torch.Tensor, np.ndarray)) and rgb.ndim == 3:
+            raise Exception("This is not yet supported")
+        if isinstance(rgb, supported_formats):  # single inputs are wrapped so the code below always sees lists
+            rgb = [rgb]
+            depth = [depth]
+        elif not (isinstance(rgb, list) and all(isinstance(i, supported_formats) for i in rgb)):
+            raise ValueError(  # NOTE(review): `', '.join(supported_formats)` joins classes, not strings, so building this message raises TypeError instead
+                f"Input is in incorrect format: {[type(i) for i in rgb]}. Currently, we only support {', '.join(supported_formats)}"
+            )
+        if isinstance(rgb[0], PIL.Image.Image):
+            if self.config.do_convert_rgb:
+                raise Exception("This is not yet supported")
+                # rgb = [self.convert_to_rgb(i) for i in rgb]
+                # depth = [self.convert_to_depth(i) for i in depth]  #TODO define convert_to_depth
+            if self.config.do_resize or target_res:
+                height, width = self.get_default_height_width(rgb[0], height, width) if not target_res else target_res  # NOTE(review): a set `target_res` is unpacked into (height, width); an int, as annotated, cannot be unpacked — TODO confirm expected type
+                rgb = [self.resize(i, height, width) for i in rgb]
+                depth = [self.resize(i, height, width) for i in depth]
+            rgb = self.pil_to_numpy(rgb)  # to np
+            rgb = self.numpy_to_pt(rgb)  # to pt
+            depth = self.depth_pil_to_numpy(depth)  # to np
+            depth = self.numpy_to_pt(depth)  # to pt
+        elif isinstance(rgb[0], np.ndarray):
+            rgb = np.concatenate(rgb, axis=0) if rgb[0].ndim == 4 else np.stack(rgb, axis=0)
+            rgb = self.numpy_to_pt(rgb)
+            height, width = self.get_default_height_width(rgb, height, width)
+            if self.config.do_resize:
+                rgb = self.resize(rgb, height, width)
+            depth = np.concatenate(depth, axis=0) if rgb[0].ndim == 4 else np.stack(depth, axis=0)  # NOTE(review): tests `rgb[0].ndim` after `rgb` was already converted by `numpy_to_pt` above; presumably `depth[0].ndim` was intended — TODO confirm
+            depth = self.numpy_to_pt(depth)
+            height, width = self.get_default_height_width(depth, height, width)
+            if self.config.do_resize:
+                depth = self.resize(depth, height, width)
+        elif isinstance(rgb[0], torch.Tensor):
+            raise Exception("This is not yet supported")
+            # rgb = torch.cat(rgb, axis=0) if rgb[0].ndim == 4 else torch.stack(rgb, axis=0)
+            # if self.config.do_convert_grayscale and rgb.ndim == 3:
+            #     rgb = rgb.unsqueeze(1)
+            # channel = rgb.shape[1]
+            # height, width = self.get_default_height_width(rgb, height, width)
+            # if self.config.do_resize:
+            #     rgb = self.resize(rgb, height, width)
+            # depth = torch.cat(depth, axis=0) if depth[0].ndim == 4 else torch.stack(depth, axis=0)
+            # if self.config.do_convert_grayscale and depth.ndim == 3:
+            #     depth = depth.unsqueeze(1)
+            # channel = depth.shape[1]
+            # # don't need any preprocess if the image is latents
+            # if depth == 4:
+            #     return rgb, depth
+            # height, width = self.get_default_height_width(depth, height, width)
+            # if self.config.do_resize:
+            #     depth = self.resize(depth, height, width)
+        # expected range [0,1], normalize to [-1,1]
+        do_normalize = self.config.do_normalize
+        if rgb.min() < 0 and do_normalize:  # only `rgb` is inspected here; `depth` follows the same decision below
+            warnings.warn(
+                "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
+                f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{rgb.min()},{rgb.max()}]",
+                FutureWarning,
+            )
+            do_normalize = False
+        if do_normalize:
+            rgb = self.normalize(rgb)
+            depth = self.normalize(depth)
+        if self.config.do_binarize:
+            rgb = self.binarize(rgb)
+            depth = self.binarize(depth)
+        return rgb, depth

diffusers/loaders/__init__.py ADDED Viewed

@@ -0,0 +1,82 @@
+from typing import TYPE_CHECKING
+from ..utils import DIFFUSERS_SLOW_IMPORT, _LazyModule, deprecate
+from ..utils.import_utils import is_torch_available, is_transformers_available
+def text_encoder_lora_state_dict(text_encoder):
+    deprecate(
+        "text_encoder_load_state_dict in `models`",  # NOTE(review): differs from this function's name (`text_encoder_lora_state_dict`); presumably a typo — TODO confirm
+        "0.27.0",
+        "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.",
+    )
+    state_dict = {}  # collects the LoRA layer entries of every attention projection under prefixed keys
+    for name, module in text_encoder_attn_modules(text_encoder):  # NOTE(review): this helper is only defined in this file when `is_transformers_available()` is true
+        for k, v in module.q_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v
+        for k, v in module.k_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v
+        for k, v in module.v_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v
+        for k, v in module.out_proj.lora_linear_layer.state_dict().items():
+            state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v
+    return state_dict
+if is_transformers_available():  # the helper below exists only when this check passes
+    def text_encoder_attn_modules(text_encoder):
+        deprecate(
+            "text_encoder_attn_modules in `models`",
+            "0.27.0",
+            "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.",
+        )  # NOTE(review): the message above names `text_encoder_lora_state_dict`, not this function — TODO confirm intended wording
+        from transformers import CLIPTextModel, CLIPTextModelWithProjection
+        attn_modules = []  # list of (dotted module name, self-attention module) pairs
+        if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
+            for i, layer in enumerate(text_encoder.text_model.encoder.layers):
+                name = f"text_model.encoder.layers.{i}.self_attn"
+                mod = layer.self_attn
+                attn_modules.append((name, mod))
+        else:
+            raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}")
+        return attn_modules
+_import_structure = {}
+if is_torch_available():
+    _import_structure["single_file"] = ["FromOriginalControlnetMixin", "FromOriginalVAEMixin"]
+    _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
+    _import_structure["utils"] = ["AttnProcsLayers"]
+    if is_transformers_available():
+        _import_structure["single_file"].extend(["FromSingleFileMixin"])
+        _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"]
+        _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
+        _import_structure["ip_adapter"] = ["IPAdapterMixin"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    if is_torch_available():
+        from .single_file import FromOriginalControlnetMixin, FromOriginalVAEMixin
+        from .unet import UNet2DConditionLoadersMixin
+        from .utils import AttnProcsLayers
+        if is_transformers_available():
+            from .ip_adapter import IPAdapterMixin
+            from .lora import LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin
+            from .single_file import FromSingleFileMixin
+            from .textual_inversion import TextualInversionLoaderMixin
+else:
+    import sys
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

diffusers 0.23.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

diffusers 0.23.0py3-none-any.whl → 0.24.0py3-none-any.whl