diffusers 0.23.1__py3-none-any.whl → 0.24.0__py3-none-any.whl

Files changed (176)
  1. diffusers/__init__.py +16 -2
  2. diffusers/configuration_utils.py +1 -0
  3. diffusers/dependency_versions_check.py +0 -1
  4. diffusers/dependency_versions_table.py +4 -5
  5. diffusers/image_processor.py +186 -14
  6. diffusers/loaders/__init__.py +82 -0
  7. diffusers/loaders/ip_adapter.py +157 -0
  8. diffusers/loaders/lora.py +1415 -0
  9. diffusers/loaders/lora_conversion_utils.py +284 -0
  10. diffusers/loaders/single_file.py +631 -0
  11. diffusers/loaders/textual_inversion.py +459 -0
  12. diffusers/loaders/unet.py +735 -0
  13. diffusers/loaders/utils.py +59 -0
  14. diffusers/models/__init__.py +12 -1
  15. diffusers/models/attention.py +165 -14
  16. diffusers/models/attention_flax.py +9 -1
  17. diffusers/models/attention_processor.py +286 -1
  18. diffusers/models/autoencoder_asym_kl.py +14 -9
  19. diffusers/models/autoencoder_kl.py +3 -18
  20. diffusers/models/autoencoder_kl_temporal_decoder.py +402 -0
  21. diffusers/models/autoencoder_tiny.py +20 -24
  22. diffusers/models/consistency_decoder_vae.py +37 -30
  23. diffusers/models/controlnet.py +59 -39
  24. diffusers/models/controlnet_flax.py +19 -18
  25. diffusers/models/embeddings_flax.py +2 -0
  26. diffusers/models/lora.py +131 -1
  27. diffusers/models/modeling_flax_utils.py +2 -1
  28. diffusers/models/modeling_outputs.py +17 -0
  29. diffusers/models/modeling_utils.py +27 -19
  30. diffusers/models/normalization.py +2 -2
  31. diffusers/models/resnet.py +390 -59
  32. diffusers/models/transformer_2d.py +20 -3
  33. diffusers/models/transformer_temporal.py +183 -1
  34. diffusers/models/unet_2d_blocks_flax.py +5 -0
  35. diffusers/models/unet_2d_condition.py +9 -0
  36. diffusers/models/unet_2d_condition_flax.py +13 -13
  37. diffusers/models/unet_3d_blocks.py +957 -173
  38. diffusers/models/unet_3d_condition.py +16 -8
  39. diffusers/models/unet_kandi3.py +589 -0
  40. diffusers/models/unet_motion_model.py +48 -33
  41. diffusers/models/unet_spatio_temporal_condition.py +489 -0
  42. diffusers/models/vae.py +63 -13
  43. diffusers/models/vae_flax.py +7 -0
  44. diffusers/models/vq_model.py +3 -1
  45. diffusers/optimization.py +16 -9
  46. diffusers/pipelines/__init__.py +65 -12
  47. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +93 -23
  48. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +97 -25
  49. diffusers/pipelines/animatediff/pipeline_animatediff.py +34 -4
  50. diffusers/pipelines/audioldm/pipeline_audioldm.py +1 -0
  51. diffusers/pipelines/auto_pipeline.py +6 -0
  52. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -0
  53. diffusers/pipelines/controlnet/pipeline_controlnet.py +217 -31
  54. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +101 -32
  55. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +136 -39
  56. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +119 -37
  57. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +196 -35
  58. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +102 -31
  59. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -0
  60. diffusers/pipelines/ddim/pipeline_ddim.py +1 -0
  61. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -0
  62. diffusers/pipelines/deepfloyd_if/pipeline_if.py +13 -1
  63. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +13 -1
  64. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +13 -1
  65. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +13 -1
  66. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +13 -1
  67. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +13 -1
  68. diffusers/pipelines/dit/pipeline_dit.py +1 -0
  69. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  70. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +3 -3
  71. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  72. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +1 -1
  73. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +1 -1
  74. diffusers/pipelines/kandinsky3/__init__.py +49 -0
  75. diffusers/pipelines/kandinsky3/kandinsky3_pipeline.py +452 -0
  76. diffusers/pipelines/kandinsky3/kandinsky3img2img_pipeline.py +460 -0
  77. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +65 -6
  78. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +55 -3
  79. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -0
  80. diffusers/pipelines/musicldm/pipeline_musicldm.py +1 -1
  81. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +7 -2
  82. diffusers/pipelines/pipeline_flax_utils.py +4 -2
  83. diffusers/pipelines/pipeline_utils.py +33 -13
  84. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +196 -36
  85. diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +1 -0
  86. diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -0
  87. diffusers/pipelines/stable_diffusion/__init__.py +64 -21
  88. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +8 -3
  89. diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +18 -2
  90. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +2 -2
  91. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +2 -4
  92. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -0
  93. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py +1 -0
  94. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +88 -9
  95. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +1 -0
  96. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +8 -3
  97. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +1 -0
  98. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +1 -0
  99. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py +1 -0
  100. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +1 -0
  101. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -9
  102. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -9
  103. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +1 -0
  104. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -13
  105. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -0
  106. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +1 -0
  107. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +1 -0
  108. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +1 -0
  109. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -0
  110. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +1 -0
  111. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +1 -0
  112. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +1 -0
  113. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +1 -0
  114. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +103 -8
  115. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +113 -8
  116. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +115 -9
  117. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -12
  118. diffusers/pipelines/stable_video_diffusion/__init__.py +58 -0
  119. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +649 -0
  120. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +108 -12
  121. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +109 -14
  122. diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -0
  123. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +1 -0
  124. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +18 -3
  125. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -2
  126. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +872 -0
  127. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +29 -40
  128. diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -0
  129. diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -0
  130. diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -0
  131. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +14 -4
  132. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +9 -5
  133. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +1 -1
  134. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +2 -2
  135. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +1 -1
  136. diffusers/schedulers/__init__.py +2 -4
  137. diffusers/schedulers/deprecated/__init__.py +50 -0
  138. diffusers/schedulers/{scheduling_karras_ve.py → deprecated/scheduling_karras_ve.py} +4 -4
  139. diffusers/schedulers/{scheduling_sde_vp.py → deprecated/scheduling_sde_vp.py} +4 -6
  140. diffusers/schedulers/scheduling_ddim.py +1 -3
  141. diffusers/schedulers/scheduling_ddim_inverse.py +1 -3
  142. diffusers/schedulers/scheduling_ddim_parallel.py +1 -3
  143. diffusers/schedulers/scheduling_ddpm.py +1 -3
  144. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -3
  145. diffusers/schedulers/scheduling_deis_multistep.py +15 -5
  146. diffusers/schedulers/scheduling_dpmsolver_multistep.py +15 -5
  147. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +15 -5
  148. diffusers/schedulers/scheduling_dpmsolver_sde.py +1 -3
  149. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +15 -5
  150. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +1 -3
  151. diffusers/schedulers/scheduling_euler_discrete.py +40 -13
  152. diffusers/schedulers/scheduling_heun_discrete.py +15 -5
  153. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +15 -5
  154. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +15 -5
  155. diffusers/schedulers/scheduling_lcm.py +123 -29
  156. diffusers/schedulers/scheduling_lms_discrete.py +1 -3
  157. diffusers/schedulers/scheduling_pndm.py +1 -3
  158. diffusers/schedulers/scheduling_repaint.py +1 -3
  159. diffusers/schedulers/scheduling_unipc_multistep.py +15 -5
  160. diffusers/utils/__init__.py +1 -0
  161. diffusers/utils/constants.py +8 -7
  162. diffusers/utils/dummy_pt_objects.py +45 -0
  163. diffusers/utils/dummy_torch_and_transformers_objects.py +60 -0
  164. diffusers/utils/dynamic_modules_utils.py +4 -4
  165. diffusers/utils/export_utils.py +8 -3
  166. diffusers/utils/logging.py +10 -10
  167. diffusers/utils/outputs.py +5 -5
  168. diffusers/utils/peft_utils.py +88 -44
  169. diffusers/utils/torch_utils.py +2 -2
  170. {diffusers-0.23.1.dist-info → diffusers-0.24.0.dist-info}/METADATA +38 -22
  171. {diffusers-0.23.1.dist-info → diffusers-0.24.0.dist-info}/RECORD +175 -157
  172. diffusers/loaders.py +0 -3336
  173. {diffusers-0.23.1.dist-info → diffusers-0.24.0.dist-info}/LICENSE +0 -0
  174. {diffusers-0.23.1.dist-info → diffusers-0.24.0.dist-info}/WHEEL +0 -0
  175. {diffusers-0.23.1.dist-info → diffusers-0.24.0.dist-info}/entry_points.txt +0 -0
  176. {diffusers-0.23.1.dist-info → diffusers-0.24.0.dist-info}/top_level.txt +0 -0
diffusers/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.23.1"
+ __version__ = "0.24.0"

  from typing import TYPE_CHECKING

@@ -76,9 +76,11 @@ else:
  [
  "AsymmetricAutoencoderKL",
  "AutoencoderKL",
+ "AutoencoderKLTemporalDecoder",
  "AutoencoderTiny",
  "ConsistencyDecoderVAE",
  "ControlNetModel",
+ "Kandinsky3UNet",
  "ModelMixin",
  "MotionAdapter",
  "MultiAdapter",
@@ -91,9 +93,11 @@ else:
  "UNet2DModel",
  "UNet3DConditionModel",
  "UNetMotionModel",
+ "UNetSpatioTemporalConditionModel",
  "VQModel",
  ]
  )
+
  _import_structure["optimization"] = [
  "get_constant_schedule",
  "get_constant_schedule_with_warmup",
@@ -103,7 +107,6 @@ else:
  "get_polynomial_decay_schedule_with_warmup",
  "get_scheduler",
  ]
-
  _import_structure["pipelines"].extend(
  [
  "AudioPipelineOutput",
@@ -214,6 +217,8 @@ else:
  "IFPipeline",
  "IFSuperResolutionPipeline",
  "ImageTextPipelineOutput",
+ "Kandinsky3Img2ImgPipeline",
+ "Kandinsky3Pipeline",
  "KandinskyCombinedPipeline",
  "KandinskyImg2ImgCombinedPipeline",
  "KandinskyImg2ImgPipeline",
@@ -274,8 +279,10 @@ else:
  "StableDiffusionXLPipeline",
  "StableUnCLIPImg2ImgPipeline",
  "StableUnCLIPPipeline",
+ "StableVideoDiffusionPipeline",
  "TextToVideoSDPipeline",
  "TextToVideoZeroPipeline",
+ "TextToVideoZeroSDXLPipeline",
  "UnCLIPImageVariationPipeline",
  "UnCLIPPipeline",
  "UniDiffuserModel",
@@ -443,9 +450,11 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  from .models import (
  AsymmetricAutoencoderKL,
  AutoencoderKL,
+ AutoencoderKLTemporalDecoder,
  AutoencoderTiny,
  ConsistencyDecoderVAE,
  ControlNetModel,
+ Kandinsky3UNet,
  ModelMixin,
  MotionAdapter,
  MultiAdapter,
@@ -458,6 +467,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  UNet2DModel,
  UNet3DConditionModel,
  UNetMotionModel,
+ UNetSpatioTemporalConditionModel,
  VQModel,
  )
  from .optimization import (
@@ -560,6 +570,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  IFPipeline,
  IFSuperResolutionPipeline,
  ImageTextPipelineOutput,
+ Kandinsky3Img2ImgPipeline,
+ Kandinsky3Pipeline,
  KandinskyCombinedPipeline,
  KandinskyImg2ImgCombinedPipeline,
  KandinskyImg2ImgPipeline,
@@ -620,8 +632,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  StableDiffusionXLPipeline,
  StableUnCLIPImg2ImgPipeline,
  StableUnCLIPPipeline,
+ StableVideoDiffusionPipeline,
  TextToVideoSDPipeline,
  TextToVideoZeroPipeline,
+ TextToVideoZeroSDXLPipeline,
  UnCLIPImageVariationPipeline,
  UnCLIPPipeline,
  UniDiffuserModel,
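Taken together, the `__init__.py` changes above expose the new 0.24.0 models and pipelines from the package root. A quick import check, assuming diffusers 0.24.0 is installed alongside torch and transformers (illustrative):

```python
import diffusers

# Names newly exported from the package root in 0.24.0 (see the diff above).
from diffusers import (
    AutoencoderKLTemporalDecoder,
    Kandinsky3Pipeline,
    StableVideoDiffusionPipeline,
    TextToVideoZeroSDXLPipeline,
    UNetSpatioTemporalConditionModel,
)

print(diffusers.__version__)  # expected: "0.24.0"
```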
diffusers/configuration_utils.py CHANGED
@@ -95,6 +95,7 @@ class ConfigMixin:
  should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by
  subclass).
  """
+
  config_name = None
  ignore_for_config = []
  has_compatibles = False
diffusers/dependency_versions_check.py CHANGED
@@ -11,7 +11,6 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import sys

  from .dependency_versions_table import deps
  from .utils.versions import require_version, require_version_core
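`dependency_versions_check.py` still enforces the pins defined in `dependency_versions_table.py` (updated below). A small sketch of how a pin is checked at runtime, using the `require_version` helper imported above (illustrative):

```python
from diffusers.dependency_versions_table import deps
from diffusers.utils.versions import require_version

# Raises if peft is missing or does not satisfy the new ">=0.6.0" pin.
require_version(deps["peft"])
print(deps["huggingface-hub"])  # "huggingface-hub>=0.19.4"
```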
diffusers/dependency_versions_table.py CHANGED
@@ -1,16 +1,15 @@
  # THIS FILE HAS BEEN AUTOGENERATED. To update:
  # 1. modify the `_deps` dict in setup.py
- # 2. run `make deps_table_update``
+ # 2. run `make deps_table_update`
  deps = {
  "Pillow": "Pillow",
  "accelerate": "accelerate>=0.11.0",
  "compel": "compel==0.1.8",
- "black": "black~=23.1",
  "datasets": "datasets",
  "filelock": "filelock",
  "flax": "flax>=0.4.1",
  "hf-doc-builder": "hf-doc-builder>=0.3.0",
- "huggingface-hub": "huggingface-hub>=0.13.2",
+ "huggingface-hub": "huggingface-hub>=0.19.4",
  "requests-mock": "requests-mock==1.10.0",
  "importlib_metadata": "importlib_metadata",
  "invisible-watermark": "invisible-watermark>=0.2.0",
@@ -25,13 +24,13 @@ deps = {
  "numpy": "numpy",
  "omegaconf": "omegaconf",
  "parameterized": "parameterized",
- "peft": "peft<=0.6.2",
+ "peft": "peft>=0.6.0",
  "protobuf": "protobuf>=3.20.3,<4",
  "pytest": "pytest",
  "pytest-timeout": "pytest-timeout",
  "pytest-xdist": "pytest-xdist",
  "python": "python>=3.8.0",
- "ruff": "ruff==0.0.280",
+ "ruff": "ruff>=0.1.5,<=0.2",
  "safetensors": "safetensors>=0.3.1",
  "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
  "scipy": "scipy",
diffusers/image_processor.py CHANGED
@@ -13,7 +13,7 @@
  # limitations under the License.

  import warnings
- from typing import List, Optional, Union
+ from typing import List, Optional, Tuple, Union

  import numpy as np
  import PIL.Image
@@ -33,6 +33,15 @@ PipelineImageInput = Union[
  List[torch.FloatTensor],
  ]

+ PipelineDepthInput = Union[
+ PIL.Image.Image,
+ np.ndarray,
+ torch.FloatTensor,
+ List[PIL.Image.Image],
+ List[np.ndarray],
+ List[torch.FloatTensor],
+ ]
+

  class VaeImageProcessor(ConfigMixin):
  """
@@ -126,14 +135,14 @@ class VaeImageProcessor(ConfigMixin):
  return images

  @staticmethod
- def normalize(images):
+ def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
  """
  Normalize an image array to [-1,1].
  """
  return 2.0 * images - 1.0

  @staticmethod
- def denormalize(images):
+ def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
  """
  Denormalize an image array to [0,1].
  """
@@ -159,10 +168,10 @@ class VaeImageProcessor(ConfigMixin):

  def get_default_height_width(
  self,
- image: [PIL.Image.Image, np.ndarray, torch.Tensor],
+ image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
  height: Optional[int] = None,
  width: Optional[int] = None,
- ):
+ ) -> Tuple[int, int]:
  """
  This function return the height and width that are downscaled to the next integer multiple of
  `vae_scale_factor`.
@@ -202,12 +211,24 @@ class VaeImageProcessor(ConfigMixin):

  def resize(
  self,
- image: [PIL.Image.Image, np.ndarray, torch.Tensor],
+ image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
  height: Optional[int] = None,
  width: Optional[int] = None,
- ) -> [PIL.Image.Image, np.ndarray, torch.Tensor]:
+ ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
  """
  Resize image.
+
+ Args:
+ image (`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
+ The image input, can be a PIL image, numpy array or pytorch tensor.
+ height (`int`, *optional*, defaults to `None`):
+ The height to resize to.
+ width (`int`, *optional*`, defaults to `None`):
+ The width to resize to.
+
+ Returns:
+ `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
+ The resized image.
  """
  if isinstance(image, PIL.Image.Image):
  image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
@@ -227,7 +248,15 @@

  def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
  """
- create a mask
+ Create a mask.
+
+ Args:
+ image (`PIL.Image.Image`):
+ The image input, should be a PIL image.
+
+ Returns:
+ `PIL.Image.Image`:
+ The binarized image. Values less than 0.5 are set to 0, values greater than 0.5 are set to 1.
  """
  image[image < 0.5] = 0
  image[image >= 0.5] = 1
@@ -306,7 +335,7 @@

  # expected range [0,1], normalize to [-1,1]
  do_normalize = self.config.do_normalize
- if image.min() < 0 and do_normalize:
+ if do_normalize and image.min() < 0:
  warnings.warn(
  "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
  f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
@@ -327,7 +356,23 @@
  image: torch.FloatTensor,
  output_type: str = "pil",
  do_denormalize: Optional[List[bool]] = None,
- ):
+ ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]:
+ """
+ Postprocess the image output from tensor to `output_type`.
+
+ Args:
+ image (`torch.FloatTensor`):
+ The image input, should be a pytorch tensor with shape `B x C x H x W`.
+ output_type (`str`, *optional*, defaults to `pil`):
+ The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
+ do_denormalize (`List[bool]`, *optional*, defaults to `None`):
+ Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
+ `VaeImageProcessor` config.
+
+ Returns:
+ `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`:
+ The postprocessed image.
+ """
  if not isinstance(image, torch.Tensor):
  raise ValueError(
  f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
@@ -390,7 +435,7 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
  super().__init__()

  @staticmethod
- def numpy_to_pil(images):
+ def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
  """
  Convert a NumPy image or a batch of images to a PIL image.
  """
@@ -406,7 +451,19 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
  return pil_images

  @staticmethod
- def rgblike_to_depthmap(image):
+ def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
+ """
+ Convert a PIL image or a list of PIL images to NumPy arrays.
+ """
+ if not isinstance(images, list):
+ images = [images]
+
+ images = [np.array(image).astype(np.float32) / (2**16 - 1) for image in images]
+ images = np.stack(images, axis=0)
+ return images
+
+ @staticmethod
+ def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
  """
  Args:
  image: RGB-like depth image
@@ -416,7 +473,7 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
  """
  return image[:, :, 1] * 2**8 + image[:, :, 2]

- def numpy_to_depth(self, images):
+ def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
  """
  Convert a NumPy depth image or a batch of images to a PIL image.
  """
@@ -441,7 +498,23 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
  image: torch.FloatTensor,
  output_type: str = "pil",
  do_denormalize: Optional[List[bool]] = None,
- ):
+ ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]:
+ """
+ Postprocess the image output from tensor to `output_type`.
+
+ Args:
+ image (`torch.FloatTensor`):
+ The image input, should be a pytorch tensor with shape `B x C x H x W`.
+ output_type (`str`, *optional*, defaults to `pil`):
+ The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
+ do_denormalize (`List[bool]`, *optional*, defaults to `None`):
+ Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
+ `VaeImageProcessor` config.
+
+ Returns:
+ `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`:
+ The postprocessed image.
+ """
  if not isinstance(image, torch.Tensor):
  raise ValueError(
  f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
@@ -474,3 +547,102 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
  return self.numpy_to_pil(image), self.numpy_to_depth(image)
  else:
  raise Exception(f"This type {output_type} is not supported")
+
+ def preprocess(
+ self,
+ rgb: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
+ depth: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ target_res: Optional[int] = None,
+ ) -> torch.Tensor:
+ """
+ Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors.
+ """
+ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
+
+ # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
+ if self.config.do_convert_grayscale and isinstance(rgb, (torch.Tensor, np.ndarray)) and rgb.ndim == 3:
+ raise Exception("This is not yet supported")
+
+ if isinstance(rgb, supported_formats):
+ rgb = [rgb]
+ depth = [depth]
+ elif not (isinstance(rgb, list) and all(isinstance(i, supported_formats) for i in rgb)):
+ raise ValueError(
+ f"Input is in incorrect format: {[type(i) for i in rgb]}. Currently, we only support {', '.join(supported_formats)}"
+ )
+
+ if isinstance(rgb[0], PIL.Image.Image):
+ if self.config.do_convert_rgb:
+ raise Exception("This is not yet supported")
+ # rgb = [self.convert_to_rgb(i) for i in rgb]
+ # depth = [self.convert_to_depth(i) for i in depth] #TODO define convert_to_depth
+ if self.config.do_resize or target_res:
+ height, width = self.get_default_height_width(rgb[0], height, width) if not target_res else target_res
+ rgb = [self.resize(i, height, width) for i in rgb]
+ depth = [self.resize(i, height, width) for i in depth]
+ rgb = self.pil_to_numpy(rgb) # to np
+ rgb = self.numpy_to_pt(rgb) # to pt
+
+ depth = self.depth_pil_to_numpy(depth) # to np
+ depth = self.numpy_to_pt(depth) # to pt
+
+ elif isinstance(rgb[0], np.ndarray):
+ rgb = np.concatenate(rgb, axis=0) if rgb[0].ndim == 4 else np.stack(rgb, axis=0)
+ rgb = self.numpy_to_pt(rgb)
+ height, width = self.get_default_height_width(rgb, height, width)
+ if self.config.do_resize:
+ rgb = self.resize(rgb, height, width)
+
+ depth = np.concatenate(depth, axis=0) if rgb[0].ndim == 4 else np.stack(depth, axis=0)
+ depth = self.numpy_to_pt(depth)
+ height, width = self.get_default_height_width(depth, height, width)
+ if self.config.do_resize:
+ depth = self.resize(depth, height, width)
+
+ elif isinstance(rgb[0], torch.Tensor):
+ raise Exception("This is not yet supported")
+ # rgb = torch.cat(rgb, axis=0) if rgb[0].ndim == 4 else torch.stack(rgb, axis=0)
+
+ # if self.config.do_convert_grayscale and rgb.ndim == 3:
+ # rgb = rgb.unsqueeze(1)
+
+ # channel = rgb.shape[1]
+
+ # height, width = self.get_default_height_width(rgb, height, width)
+ # if self.config.do_resize:
+ # rgb = self.resize(rgb, height, width)
+
+ # depth = torch.cat(depth, axis=0) if depth[0].ndim == 4 else torch.stack(depth, axis=0)
+
+ # if self.config.do_convert_grayscale and depth.ndim == 3:
+ # depth = depth.unsqueeze(1)
+
+ # channel = depth.shape[1]
+ # # don't need any preprocess if the image is latents
+ # if depth == 4:
+ # return rgb, depth
+
+ # height, width = self.get_default_height_width(depth, height, width)
+ # if self.config.do_resize:
+ # depth = self.resize(depth, height, width)
+ # expected range [0,1], normalize to [-1,1]
+ do_normalize = self.config.do_normalize
+ if rgb.min() < 0 and do_normalize:
+ warnings.warn(
+ "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
+ f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{rgb.min()},{rgb.max()}]",
+ FutureWarning,
+ )
+ do_normalize = False
+
+ if do_normalize:
+ rgb = self.normalize(rgb)
+ depth = self.normalize(depth)
+
+ if self.config.do_binarize:
+ rgb = self.binarize(rgb)
+ depth = self.binarize(depth)
+
+ return rgb, depth
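Beyond the added type hints and docstrings, `VaeImageProcessorLDM3D` gains depth-aware helpers (`depth_pil_to_numpy` and the joint `preprocess` for RGB/depth pairs). A minimal sketch of the 16-bit depth conversion, with a random depth map standing in for real data (illustrative):

```python
import numpy as np
import PIL.Image

from diffusers.image_processor import VaeImageProcessorLDM3D

# Fake 16-bit depth map as a PIL image; a real one would come from an LDM3D pipeline.
depth_pil = PIL.Image.fromarray(
    np.random.randint(0, 2**16 - 1, size=(64, 64), dtype=np.uint16)
)

# New in 0.24.0: scales 16-bit depth values to float32 arrays in [0, 1].
depth_np = VaeImageProcessorLDM3D.depth_pil_to_numpy(depth_pil)
print(depth_np.shape, depth_np.dtype)  # (1, 64, 64) float32
```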
diffusers/loaders/__init__.py ADDED
@@ -0,0 +1,82 @@
+ from typing import TYPE_CHECKING
+
+ from ..utils import DIFFUSERS_SLOW_IMPORT, _LazyModule, deprecate
+ from ..utils.import_utils import is_torch_available, is_transformers_available
+
+
+ def text_encoder_lora_state_dict(text_encoder):
+ deprecate(
+ "text_encoder_load_state_dict in `models`",
+ "0.27.0",
+ "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.",
+ )
+ state_dict = {}
+
+ for name, module in text_encoder_attn_modules(text_encoder):
+ for k, v in module.q_proj.lora_linear_layer.state_dict().items():
+ state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v
+
+ for k, v in module.k_proj.lora_linear_layer.state_dict().items():
+ state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v
+
+ for k, v in module.v_proj.lora_linear_layer.state_dict().items():
+ state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v
+
+ for k, v in module.out_proj.lora_linear_layer.state_dict().items():
+ state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v
+
+ return state_dict
+
+
+ if is_transformers_available():
+
+ def text_encoder_attn_modules(text_encoder):
+ deprecate(
+ "text_encoder_attn_modules in `models`",
+ "0.27.0",
+ "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.",
+ )
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection
+
+ attn_modules = []
+
+ if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
+ for i, layer in enumerate(text_encoder.text_model.encoder.layers):
+ name = f"text_model.encoder.layers.{i}.self_attn"
+ mod = layer.self_attn
+ attn_modules.append((name, mod))
+ else:
+ raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}")
+
+ return attn_modules
+
+
+ _import_structure = {}
+
+ if is_torch_available():
+ _import_structure["single_file"] = ["FromOriginalControlnetMixin", "FromOriginalVAEMixin"]
+ _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
+ _import_structure["utils"] = ["AttnProcsLayers"]
+
+ if is_transformers_available():
+ _import_structure["single_file"].extend(["FromSingleFileMixin"])
+ _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"]
+ _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
+ _import_structure["ip_adapter"] = ["IPAdapterMixin"]
+
+
+ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+ if is_torch_available():
+ from .single_file import FromOriginalControlnetMixin, FromOriginalVAEMixin
+ from .unet import UNet2DConditionLoadersMixin
+ from .utils import AttnProcsLayers
+
+ if is_transformers_available():
+ from .ip_adapter import IPAdapterMixin
+ from .lora import LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin
+ from .single_file import FromSingleFileMixin
+ from .textual_inversion import TextualInversionLoaderMixin
+ else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
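With this new subpackage, the loader mixins previously defined in the monolithic `diffusers/loaders.py` (removed in this release) resolve from `diffusers.loaders` instead, lazily unless `DIFFUSERS_SLOW_IMPORT` is set. A quick sketch, assuming torch and transformers are installed so the transformers-gated entries are registered (illustrative):

```python
from diffusers.loaders import (
    IPAdapterMixin,
    LoraLoaderMixin,
    StableDiffusionXLLoraLoaderMixin,
    TextualInversionLoaderMixin,
    UNet2DConditionLoadersMixin,
)

# The lazy module maps each name back to its new submodule.
print(LoraLoaderMixin.__module__)   # diffusers.loaders.lora
print(IPAdapterMixin.__module__)    # diffusers.loaders.ip_adapter
```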
diffusers/loaders/ip_adapter.py ADDED
@@ -0,0 +1,157 @@
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import os
+ from typing import Dict, Union
+
+ import torch
+ from safetensors import safe_open
+
+ from ..utils import (
+ DIFFUSERS_CACHE,
+ HF_HUB_OFFLINE,
+ _get_model_file,
+ is_transformers_available,
+ logging,
+ )
+
+
+ if is_transformers_available():
+ from transformers import (
+ CLIPImageProcessor,
+ CLIPVisionModelWithProjection,
+ )
+
+ from ..models.attention_processor import (
+ IPAdapterAttnProcessor,
+ IPAdapterAttnProcessor2_0,
+ )
+
+ logger = logging.get_logger(__name__)
+
+
+ class IPAdapterMixin:
+ """Mixin for handling IP Adapters."""
+
+ def load_ip_adapter(
+ self,
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+ subfolder: str,
+ weight_name: str,
+ **kwargs,
+ ):
+ """
+ Parameters:
+ pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+ Can be either:
+
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+ the Hub.
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+ with [`ModelMixin.save_pretrained`].
+ - A [torch state
+ dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+ is not used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+ incompletely downloaded files are deleted.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
+ won't be downloaded from the Hub.
+ use_auth_token (`str` or *bool*, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+ allowed by Git.
+ subfolder (`str`, *optional*, defaults to `""`):
+ The subfolder location of a model file within a larger model repository on the Hub or locally.
+ """
+
+ # Load the main state dict first.
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+ force_download = kwargs.pop("force_download", False)
+ resume_download = kwargs.pop("resume_download", False)
+ proxies = kwargs.pop("proxies", None)
+ local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ revision = kwargs.pop("revision", None)
+
+ user_agent = {
+ "file_type": "attn_procs_weights",
+ "framework": "pytorch",
+ }
+
+ if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+ model_file = _get_model_file(
+ pretrained_model_name_or_path_or_dict,
+ weights_name=weight_name,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ subfolder=subfolder,
+ user_agent=user_agent,
+ )
+ if weight_name.endswith(".safetensors"):
+ state_dict = {"image_proj": {}, "ip_adapter": {}}
+ with safe_open(model_file, framework="pt", device="cpu") as f:
+ for key in f.keys():
+ if key.startswith("image_proj."):
+ state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+ elif key.startswith("ip_adapter."):
+ state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+ else:
+ state_dict = torch.load(model_file, map_location="cpu")
+ else:
+ state_dict = pretrained_model_name_or_path_or_dict
+
+ keys = list(state_dict.keys())
+ if keys != ["image_proj", "ip_adapter"]:
+ raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")
+
+ # load CLIP image encoer here if it has not been registered to the pipeline yet
+ if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
+ if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+ logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+ pretrained_model_name_or_path_or_dict,
+ subfolder=os.path.join(subfolder, "image_encoder"),
+ ).to(self.device, dtype=self.dtype)
+ self.image_encoder = image_encoder
+ else:
+ raise ValueError("`image_encoder` cannot be None when using IP Adapters.")
+
+ # create feature extractor if it has not been registered to the pipeline yet
+ if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
+ self.feature_extractor = CLIPImageProcessor()
+
+ # load ip-adapter into unet
+ self.unet._load_ip_adapter_weights(state_dict)
+
+ def set_ip_adapter_scale(self, scale):
+ for attn_processor in self.unet.attn_processors.values():
+ if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+ attn_processor.scale = scale
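A minimal usage sketch of the new `IPAdapterMixin` on a Stable Diffusion pipeline. The repository, subfolder, weight name, and reference image below are illustrative; any checkpoint laid out the way `load_ip_adapter` expects (an `image_proj`/`ip_adapter` state dict plus an `image_encoder` subfolder) should work, and the `ip_adapter_image` argument assumes a pipeline that has picked up IP-Adapter support in 0.24.0:

```python
import torch

from diffusers import StableDiffusionPipeline
from diffusers.utils import load_image

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Illustrative repository/weight names following the expected layout.
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipe.set_ip_adapter_scale(0.6)  # rescales every IPAdapterAttnProcessor in the UNet

reference = load_image("ip_adapter_reference.png")  # hypothetical local reference image
image = pipe(
    prompt="a polar bear sitting in a chair drinking a milkshake",
    ip_adapter_image=reference,
    num_inference_steps=25,
).images[0]
image.save("ip_adapter_result.png")
```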