diffusers-0.15.1-py3-none-any.whl → diffusers-0.16.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +7 -2
- diffusers/configuration_utils.py +4 -0
- diffusers/loaders.py +262 -12
- diffusers/models/attention.py +31 -12
- diffusers/models/attention_processor.py +189 -0
- diffusers/models/controlnet.py +9 -2
- diffusers/models/embeddings.py +66 -0
- diffusers/models/modeling_pytorch_flax_utils.py +6 -0
- diffusers/models/modeling_utils.py +5 -2
- diffusers/models/transformer_2d.py +1 -1
- diffusers/models/unet_2d_condition.py +45 -6
- diffusers/models/vae.py +3 -0
- diffusers/pipelines/__init__.py +8 -0
- diffusers/pipelines/alt_diffusion/modeling_roberta_series.py +25 -10
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +8 -0
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +8 -0
- diffusers/pipelines/audioldm/pipeline_audioldm.py +1 -1
- diffusers/pipelines/deepfloyd_if/__init__.py +54 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +854 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +979 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +1097 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +1098 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +1208 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +947 -0
- diffusers/pipelines/deepfloyd_if/safety_checker.py +59 -0
- diffusers/pipelines/deepfloyd_if/timesteps.py +579 -0
- diffusers/pipelines/deepfloyd_if/watermark.py +46 -0
- diffusers/pipelines/pipeline_utils.py +54 -25
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +37 -20
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +12 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +10 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +59 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +9 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +10 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +9 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +22 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +9 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +34 -30
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +93 -10
- diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +45 -6
- diffusers/schedulers/scheduling_ddpm.py +63 -16
- diffusers/schedulers/scheduling_heun_discrete.py +51 -1
- diffusers/utils/__init__.py +4 -1
- diffusers/utils/dummy_torch_and_transformers_objects.py +80 -5
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/hub_utils.py +4 -1
- diffusers/utils/import_utils.py +41 -0
- diffusers/utils/pil_utils.py +24 -0
- diffusers/utils/testing_utils.py +10 -0
- {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/METADATA +1 -1
- {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/RECORD +57 -47
- {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/LICENSE +0 -0
- {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/WHEEL +0 -0
- {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/top_level.txt +0 -0
diffusers/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.15.1"
+__version__ = "0.16.1"
 
 from .configuration_utils import ConfigMixin
 from .utils import (
@@ -109,12 +109,17 @@ try:
 except OptionalDependencyNotAvailable:
     from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
 else:
-    from .loaders import TextualInversionLoaderMixin
     from .pipelines import (
         AltDiffusionImg2ImgPipeline,
         AltDiffusionPipeline,
         AudioLDMPipeline,
         CycleDiffusionPipeline,
+        IFImg2ImgPipeline,
+        IFImg2ImgSuperResolutionPipeline,
+        IFInpaintingPipeline,
+        IFInpaintingSuperResolutionPipeline,
+        IFPipeline,
+        IFSuperResolutionPipeline,
         LDMTextToImagePipeline,
         PaintByExamplePipeline,
         SemanticStableDiffusionPipeline,
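The headline addition in 0.16 is the DeepFloyd IF pipeline family exported above. A minimal usage sketch of the first stage, assuming the gated `DeepFloyd/IF-I-XL-v1.0` weights and the fp16 variant as published on the model card (details may differ from your setup):

```py
# Hedged sketch: repo id, fp16 variant, and encode_prompt usage follow the
# DeepFloyd IF model card, not this diff.
import torch
from diffusers import IFPipeline

pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()  # the T5 text encoder is large; offload to fit on one GPU

# IF encodes prompts with T5 up front, so the embeddings can be reused across stages
prompt_embeds, negative_embeds = pipe.encode_prompt("a photo of a kangaroo wearing an orange hoodie")
image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images
```

The low-resolution output is then meant to be fed to `IFSuperResolutionPipeline` (and its img2img/inpainting variants listed in the file table) for upscaling.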
diffusers/configuration_utils.py
CHANGED
@@ -109,6 +109,7 @@ class ConfigMixin:
         # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
         # or solve in a more general way.
         kwargs.pop("kwargs", None)
+
         if not hasattr(self, "_internal_dict"):
             internal_dict = kwargs
         else:
@@ -550,6 +551,9 @@ class ConfigMixin:
             return value
 
         config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()}
+        # Don't save "_ignore_files"
+        config_dict.pop("_ignore_files", None)
+
         return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
 
     def to_json_file(self, json_file_path: Union[str, os.PathLike]):
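The `to_json_string` change keeps the private `_ignore_files` bookkeeping entry out of saved configs. A toy sketch of the effect (the dict contents are hypothetical):

```py
import json

# "_ignore_files" is internal state and should never land in config.json
config_dict = {"sample_size": 64, "_ignore_files": ["vae/diffusion_pytorch_model.bin"]}
config_dict.pop("_ignore_files", None)  # mirrors the new to_json_string behavior
print(json.dumps(config_dict, indent=2, sort_keys=True))  # only "sample_size" remains
```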
diffusers/loaders.py
CHANGED
@@ -13,11 +13,17 @@
 # limitations under the License.
 import os
 from collections import defaultdict
+from pathlib import Path
 from typing import Callable, Dict, List, Optional, Union
 
 import torch
+from huggingface_hub import hf_hub_download
 
-from .models.attention_processor import LoRAAttnProcessor
+from .models.attention_processor import (
+    CustomDiffusionAttnProcessor,
+    CustomDiffusionXFormersAttnProcessor,
+    LoRAAttnProcessor,
+)
 from .utils import (
     DIFFUSERS_CACHE,
     HF_HUB_OFFLINE,
@@ -46,6 +52,9 @@ LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors"
 TEXT_INVERSION_NAME = "learned_embeds.bin"
 TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors"
 
+CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin"
+CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors"
+
 
 class AttnProcsLayers(torch.nn.Module):
     def __init__(self, state_dict: Dict[str, torch.Tensor]):
@@ -213,6 +222,7 @@ class UNet2DConditionLoadersMixin:
         attn_processors = {}
 
         is_lora = all("lora" in k for k in state_dict.keys())
+        is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
 
         if is_lora:
             lora_grouped_dict = defaultdict(dict)
@@ -229,9 +239,38 @@
                     hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank
                 )
                 attn_processors[key].load_state_dict(value_dict)
-
+        elif is_custom_diffusion:
+            custom_diffusion_grouped_dict = defaultdict(dict)
+            for key, value in state_dict.items():
+                if len(value) == 0:
+                    custom_diffusion_grouped_dict[key] = {}
+                else:
+                    if "to_out" in key:
+                        attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+                    else:
+                        attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
+                    custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value
+
+            for key, value_dict in custom_diffusion_grouped_dict.items():
+                if len(value_dict) == 0:
+                    attn_processors[key] = CustomDiffusionAttnProcessor(
+                        train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None
+                    )
+                else:
+                    cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
+                    hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
+                    train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
+                    attn_processors[key] = CustomDiffusionAttnProcessor(
+                        train_kv=True,
+                        train_q_out=train_q_out,
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                    )
+                attn_processors[key].load_state_dict(value_dict)
         else:
-            raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.")
+            raise ValueError(
+                f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training."
+            )
 
         # set correct dtype & device
         attn_processors = {k: v.to(device=self.device, dtype=self.dtype) for k, v in attn_processors.items()}
@@ -285,16 +324,31 @@ class UNet2DConditionLoadersMixin:
 
         os.makedirs(save_directory, exist_ok=True)
 
-        model_to_save = AttnProcsLayers(self.attn_processors)
-
-        # Save the model
-        state_dict = model_to_save.state_dict()
+        is_custom_diffusion = any(
+            isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))
+            for (_, x) in self.attn_processors.items()
+        )
+        if is_custom_diffusion:
+            model_to_save = AttnProcsLayers(
+                {
+                    y: x
+                    for (y, x) in self.attn_processors.items()
+                    if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))
+                }
+            )
+            state_dict = model_to_save.state_dict()
+            for name, attn in self.attn_processors.items():
+                if len(attn.state_dict()) == 0:
+                    state_dict[name] = {}
+        else:
+            model_to_save = AttnProcsLayers(self.attn_processors)
+            state_dict = model_to_save.state_dict()
 
         if weight_name is None:
             if safe_serialization:
-                weight_name = LORA_WEIGHT_NAME_SAFE
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE
             else:
-                weight_name = LORA_WEIGHT_NAME
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME
 
         # Save the model
         save_function(state_dict, os.path.join(save_directory, weight_name))
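Together, the two hunks above teach `load_attn_procs`/`save_attn_procs` to round-trip Custom Diffusion processors alongside LoRA. A hedged sketch of that round trip; the directory name is hypothetical, and a real workflow would first install trained `CustomDiffusionAttnProcessor` instances on the UNet via training code:

```py
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# With Custom Diffusion processors present on the UNet, the is_custom_diffusion
# check above makes save_attn_procs pick pytorch_custom_diffusion_weights.bin.
pipe.unet.save_attn_procs("custom-diffusion-ckpt")

# On load, load_attn_procs keys off "custom_diffusion" substrings in the state dict.
pipe.unet.load_attn_procs("custom-diffusion-ckpt", weight_name="pytorch_custom_diffusion_weights.bin")
```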
@@ -356,7 +410,7 @@ class TextualInversionLoaderMixin:
             replacement = token
             i = 1
             while f"{token}_{i}" in tokenizer.added_tokens_encoder:
-                replacement += f"{token}_{i}"
+                replacement += f" {token}_{i}"
                 i += 1
 
             prompt = prompt.replace(token, replacement)
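This one-character fix matters for multi-vector textual inversion embeddings: without the space, the prompt conversion glued the extra tokens into one unknown token. A runnable sketch of the fixed expansion (token names hypothetical):

```py
# Simulate tokenizer.added_tokens_encoder for a 3-vector embedding
added_tokens = {"charturnerv2": 49408, "charturnerv2_1": 49409, "charturnerv2_2": 49410}

token = "charturnerv2"
replacement, i = token, 1
while f"{token}_{i}" in added_tokens:
    replacement += f" {token}_{i}"  # the fix: a space before each extra vector's token
    i += 1

print(replacement)  # charturnerv2 charturnerv2_1 charturnerv2_2
```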
@@ -431,6 +485,7 @@ class TextualInversionLoaderMixin:
         Example:
 
         To load a textual inversion embedding vector in `diffusers` format:
+
         ```py
         from diffusers import StableDiffusionPipeline
         import torch
@@ -456,13 +511,14 @@
         model_id = "runwayml/stable-diffusion-v1-5"
         pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
 
-        pipe.load_textual_inversion("./charturnerv2.pt")
+        pipe.load_textual_inversion("./charturnerv2.pt", token="charturnerv2")
 
         prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a woman wearing a black jacket and red shirt, best quality, intricate details."
 
         image = pipe(prompt, num_inference_steps=50).images[0]
         image.save("character.png")
         ```
+
         """
         if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PreTrainedTokenizer):
             raise ValueError(
@@ -792,7 +848,7 @@ class LoraLoaderMixin:
         """
         # Loop over the original attention modules.
         for name, _ in self.text_encoder.named_modules():
-            if any(
+            if any(x in name for x in TEXT_ENCODER_TARGET_MODULES):
                 # Retrieve the module and its corresponding LoRA processor.
                 module = self.text_encoder.get_submodule(name)
                 # Construct a new function that performs the LoRA merging. We will monkey patch
@@ -1051,3 +1107,197 @@ class LoraLoaderMixin:
 
         save_function(state_dict, os.path.join(save_directory, weight_name))
         logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
+
+
+class FromCkptMixin:
+    """This helper class allows directly loading a `.ckpt` Stable Diffusion checkpoint
+    into the respective pipeline classes."""
+
+    @classmethod
+    def from_ckpt(cls, pretrained_model_link_or_path, **kwargs):
+        r"""
+        Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights saved in the original .ckpt format.
+
+        The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
+
+        Parameters:
+            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+                    - A link to the .ckpt file on the Hub. Should be in the format
+                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>"`
+                    - A path to a *file* containing all pipeline weights.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+                will be automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+                file exists.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether or not to only look at local files (i.e., do not try to download the model).
+            use_auth_token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `huggingface-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+            use_safetensors (`bool`, *optional*):
+                If set to `True`, the pipeline will be loaded from `safetensors` weights. If set to `None` (the
+                default), the pipeline will load from `safetensors` weights if they are available *and* if
+                `safetensors` is installed. If set to `False`, the pipeline will *not* use `safetensors`.
+            extract_ema (`bool`, *optional*, defaults to `False`):
+                Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA
+                weights or not. Pass `True` to extract the EMA weights, which usually yield higher quality images for
+                inference. Non-EMA weights are usually better to continue fine-tuning.
+            upcast_attention (`bool`, *optional*, defaults to `None`):
+                Whether the attention computation should always be upcasted. This is necessary when running Stable
+                Diffusion 2.1.
+            image_size (`int`, *optional*, defaults to 512):
+                The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
+                Base. Use 768 for Stable Diffusion v2.
+            prediction_type (`str`, *optional*):
+                The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
+                Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
+            num_in_channels (`int`, *optional*, defaults to `None`):
+                The number of input channels. If `None`, it will be automatically inferred.
+            scheduler_type (`str`, *optional*, defaults to `'pndm'`):
+                Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
+                "ddim"]`.
+            load_safety_checker (`bool`, *optional*, defaults to `True`):
+                Whether to load the safety checker or not.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load- and saveable variables - *i.e.* the pipeline components - of the
+                specific pipeline class. The overwritten components are then directly passed to the pipeline's
+                `__init__` method. See example below for more information.
+
+        Examples:
+
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionPipeline
+
+        >>> # Download pipeline from huggingface.co and cache.
+        >>> pipeline = StableDiffusionPipeline.from_ckpt(
+        ...     "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors"
+        ... )
+
+        >>> # Download pipeline from local file
+        >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt
+        >>> pipeline = StableDiffusionPipeline.from_ckpt("./v1-5-pruned-emaonly")
+
+        >>> # Enable float16 and move to GPU
+        >>> pipeline = StableDiffusionPipeline.from_ckpt(
+        ...     "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
+        ...     torch_dtype=torch.float16,
+        ... )
+        >>> pipeline.to("cuda")
+        ```
+        """
+        # import here to avoid circular dependency
+        from .pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt
+
+        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+        resume_download = kwargs.pop("resume_download", False)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        revision = kwargs.pop("revision", None)
+        extract_ema = kwargs.pop("extract_ema", False)
+        image_size = kwargs.pop("image_size", 512)
+        scheduler_type = kwargs.pop("scheduler_type", "pndm")
+        num_in_channels = kwargs.pop("num_in_channels", None)
+        upcast_attention = kwargs.pop("upcast_attention", None)
+        load_safety_checker = kwargs.pop("load_safety_checker", True)
+        prediction_type = kwargs.pop("prediction_type", None)
+
+        torch_dtype = kwargs.pop("torch_dtype", None)
+
+        use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+
+        pipeline_name = cls.__name__
+        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
+        from_safetensors = file_extension == "safetensors"
+
+        if from_safetensors and use_safetensors is False:
+            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
+
+        # TODO: For now we only support stable diffusion
+        stable_unclip = None
+        controlnet = False
+
+        if pipeline_name == "StableDiffusionControlNetPipeline":
+            model_type = "FrozenCLIPEmbedder"
+            controlnet = True
+        elif "StableDiffusion" in pipeline_name:
+            model_type = "FrozenCLIPEmbedder"
+        elif pipeline_name == "StableUnCLIPPipeline":
+            model_type = "FrozenOpenCLIPEmbedder"
+            stable_unclip = "txt2img"
+        elif pipeline_name == "StableUnCLIPImg2ImgPipeline":
+            model_type = "FrozenOpenCLIPEmbedder"
+            stable_unclip = "img2img"
+        elif pipeline_name == "PaintByExamplePipeline":
+            model_type = "PaintByExample"
+        elif pipeline_name == "LDMTextToImagePipeline":
+            model_type = "LDMTextToImage"
+        else:
+            raise ValueError(f"Unhandled pipeline class: {pipeline_name}")
+
+        # remove huggingface url
+        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
+            if pretrained_model_link_or_path.startswith(prefix):
+                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
+
+        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
+        ckpt_path = Path(pretrained_model_link_or_path)
+        if not ckpt_path.is_file():
+            # get repo_id and (potentially nested) file path of ckpt in repo
+            repo_id = str(Path().joinpath(*ckpt_path.parts[:2]))
+            file_path = str(Path().joinpath(*ckpt_path.parts[2:]))
+
+            if file_path.startswith("blob/"):
+                file_path = file_path[len("blob/") :]
+
+            if file_path.startswith("main/"):
+                file_path = file_path[len("main/") :]
+
+            pretrained_model_link_or_path = hf_hub_download(
+                repo_id,
+                filename=file_path,
+                cache_dir=cache_dir,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                force_download=force_download,
+            )
+
+        pipe = download_from_original_stable_diffusion_ckpt(
+            pretrained_model_link_or_path,
+            pipeline_class=cls,
+            model_type=model_type,
+            stable_unclip=stable_unclip,
+            controlnet=controlnet,
+            from_safetensors=from_safetensors,
+            extract_ema=extract_ema,
+            image_size=image_size,
+            scheduler_type=scheduler_type,
+            num_in_channels=num_in_channels,
+            upcast_attention=upcast_attention,
+            load_safety_checker=load_safety_checker,
+            prediction_type=prediction_type,
+        )
+
+        if torch_dtype is not None:
+            pipe.to(torch_dtype=torch_dtype)
+
+        return pipe
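One subtlety in `from_ckpt` worth tracing is how a Hub blob link is split into the `repo_id` and `filename` arguments for `hf_hub_download`. A worked example using only the standard library (the link is hypothetical):

```py
from pathlib import Path

# a prefix such as "https://huggingface.co/" has already been stripped
link = "runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
parts = Path(link).parts
repo_id = str(Path().joinpath(*parts[:2]))    # "runwayml/stable-diffusion-v1-5"
file_path = str(Path().joinpath(*parts[2:]))  # "blob/main/v1-5-pruned-emaonly.ckpt"
for marker in ("blob/", "main/"):             # drop the web-UI path segments
    if file_path.startswith(marker):
        file_path = file_path[len(marker):]
print(repo_id, file_path)  # runwayml/stable-diffusion-v1-5 v1-5-pruned-emaonly.ckpt
```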
diffusers/models/attention.py
CHANGED
@@ -60,7 +60,6 @@ class AttentionBlock(nn.Module):
         self.channels = channels
 
         self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
-        self.num_head_size = num_head_channels
         self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
 
         # define q,k,v as linear layers
@@ -72,20 +71,30 @@
         self.proj_attn = nn.Linear(channels, channels, bias=True)
 
         self._use_memory_efficient_attention_xformers = False
+        self._use_2_0_attn = True
         self._attention_op = None
 
-    def reshape_heads_to_batch_dim(self, tensor):
+    def reshape_heads_to_batch_dim(self, tensor, merge_head_and_batch=True):
         batch_size, seq_len, dim = tensor.shape
         head_size = self.num_heads
         tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
-        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
+        tensor = tensor.permute(0, 2, 1, 3)
+        if merge_head_and_batch:
+            tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
         return tensor
 
-    def reshape_batch_dim_to_heads(self, tensor):
-        batch_size, seq_len, dim = tensor.shape
+    def reshape_batch_dim_to_heads(self, tensor, unmerge_head_and_batch=True):
         head_size = self.num_heads
-        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
-        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
+
+        if unmerge_head_and_batch:
+            batch_head_size, seq_len, dim = tensor.shape
+            batch_size = batch_head_size // head_size
+
+            tensor = tensor.reshape(batch_size, head_size, seq_len, dim)
+        else:
+            batch_size, _, seq_len, dim = tensor.shape
+
+        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size, seq_len, dim * head_size)
         return tensor
 
     def set_use_memory_efficient_attention_xformers(
@@ -134,14 +143,24 @@
 
         scale = 1 / math.sqrt(self.channels / self.num_heads)
 
-        query_proj = self.reshape_heads_to_batch_dim(query_proj)
-        key_proj = self.reshape_heads_to_batch_dim(key_proj)
-        value_proj = self.reshape_heads_to_batch_dim(value_proj)
+        _use_2_0_attn = self._use_2_0_attn and not self._use_memory_efficient_attention_xformers
+        use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") and _use_2_0_attn
+
+        query_proj = self.reshape_heads_to_batch_dim(query_proj, merge_head_and_batch=not use_torch_2_0_attn)
+        key_proj = self.reshape_heads_to_batch_dim(key_proj, merge_head_and_batch=not use_torch_2_0_attn)
+        value_proj = self.reshape_heads_to_batch_dim(value_proj, merge_head_and_batch=not use_torch_2_0_attn)
 
         if self._use_memory_efficient_attention_xformers:
             # Memory efficient attention
             hidden_states = xformers.ops.memory_efficient_attention(
-                query_proj, key_proj, value_proj, attn_bias=None, op=self._attention_op
+                query_proj, key_proj, value_proj, attn_bias=None, op=self._attention_op, scale=scale
+            )
+            hidden_states = hidden_states.to(query_proj.dtype)
+        elif use_torch_2_0_attn:
+            # the output of sdp = (batch, num_heads, seq_len, head_dim)
+            # TODO: add support for attn.scale when we move to Torch 2.1
+            hidden_states = F.scaled_dot_product_attention(
+                query_proj, key_proj, value_proj, dropout_p=0.0, is_causal=False
             )
             hidden_states = hidden_states.to(query_proj.dtype)
         else:
@@ -162,7 +181,7 @@
         hidden_states = torch.bmm(attention_probs, value_proj)
 
         # reshape hidden_states
-        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+        hidden_states = self.reshape_batch_dim_to_heads(hidden_states, unmerge_head_and_batch=not use_torch_2_0_attn)
 
         # compute next hidden_states
         hidden_states = self.proj_attn(hidden_states)
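The `AttentionBlock` changes above dispatch to PyTorch 2.0's fused `F.scaled_dot_product_attention` when it exists, keeping heads as a separate dimension for the fused kernel while still folding them into the batch for the legacy bmm path. A self-contained sketch of that dispatch idea (not the diffusers code itself; for brevity both branches here keep the 4D layout):

```py
import torch
import torch.nn.functional as F

def attention(q, k, v, num_heads):
    b, s, d = q.shape
    # (batch, seq, dim) -> (batch, heads, seq, head_dim), as the fused kernel expects
    q, k, v = (t.reshape(b, s, num_heads, d // num_heads).permute(0, 2, 1, 3) for t in (q, k, v))
    if hasattr(F, "scaled_dot_product_attention"):  # PyTorch >= 2.0
        out = F.scaled_dot_product_attention(q, k, v)  # scaling handled internally
    else:
        scale = (d // num_heads) ** -0.5
        attn = (q @ k.transpose(-1, -2) * scale).softmax(dim=-1)
        out = attn @ v
    return out.permute(0, 2, 1, 3).reshape(b, s, d)

x = torch.randn(2, 16, 64)
print(attention(x, x, x, num_heads=8).shape)  # torch.Size([2, 16, 64])
```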