diffusers 0.19.3__py3-none-any.whl → 0.20.1__py3-none-any.whl
- diffusers/__init__.py +3 -1
- diffusers/commands/fp16_safetensors.py +2 -7
- diffusers/configuration_utils.py +23 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/loaders.py +62 -64
- diffusers/models/__init__.py +1 -0
- diffusers/models/activations.py +2 -0
- diffusers/models/attention.py +45 -1
- diffusers/models/autoencoder_tiny.py +193 -0
- diffusers/models/controlnet.py +1 -1
- diffusers/models/embeddings.py +56 -0
- diffusers/models/lora.py +0 -6
- diffusers/models/modeling_flax_utils.py +28 -2
- diffusers/models/modeling_utils.py +33 -16
- diffusers/models/transformer_2d.py +26 -9
- diffusers/models/unet_1d.py +2 -2
- diffusers/models/unet_2d_blocks.py +106 -56
- diffusers/models/unet_2d_condition.py +20 -5
- diffusers/models/vae.py +106 -1
- diffusers/pipelines/__init__.py +1 -0
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +10 -3
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -3
- diffusers/pipelines/audioldm/pipeline_audioldm.py +1 -1
- diffusers/pipelines/auto_pipeline.py +33 -43
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -2
- diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -4
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +15 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +14 -4
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +157 -10
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +43 -2
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +44 -2
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
- diffusers/pipelines/pipeline_flax_utils.py +41 -4
- diffusers/pipelines/pipeline_utils.py +60 -16
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/__init__.py +1 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +81 -37
- diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +12 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +832 -0
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +9 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +17 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +10 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +10 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +3 -5
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +75 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +76 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +1 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +10 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +10 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +11 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +1 -1
- diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +131 -28
- diffusers/schedulers/scheduling_consistency_models.py +70 -57
- diffusers/schedulers/scheduling_ddim.py +76 -71
- diffusers/schedulers/scheduling_ddim_inverse.py +76 -44
- diffusers/schedulers/scheduling_ddim_parallel.py +11 -8
- diffusers/schedulers/scheduling_ddpm.py +68 -67
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -15
- diffusers/schedulers/scheduling_deis_multistep.py +93 -85
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +118 -120
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +116 -109
- diffusers/schedulers/scheduling_dpmsolver_sde.py +57 -43
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +122 -121
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +54 -44
- diffusers/schedulers/scheduling_euler_discrete.py +63 -56
- diffusers/schedulers/scheduling_heun_discrete.py +57 -45
- diffusers/schedulers/scheduling_ipndm.py +27 -22
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +54 -41
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +52 -41
- diffusers/schedulers/scheduling_karras_ve.py +55 -45
- diffusers/schedulers/scheduling_lms_discrete.py +58 -52
- diffusers/schedulers/scheduling_pndm.py +77 -62
- diffusers/schedulers/scheduling_repaint.py +56 -38
- diffusers/schedulers/scheduling_sde_ve.py +62 -50
- diffusers/schedulers/scheduling_sde_vp.py +32 -11
- diffusers/schedulers/scheduling_unclip.py +3 -3
- diffusers/schedulers/scheduling_unipc_multistep.py +131 -91
- diffusers/schedulers/scheduling_utils.py +41 -35
- diffusers/schedulers/scheduling_utils_flax.py +8 -2
- diffusers/schedulers/scheduling_vq_diffusion.py +39 -68
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +15 -0
- diffusers/utils/hub_utils.py +105 -2
- diffusers/utils/import_utils.py +0 -4
- diffusers/utils/pil_utils.py +19 -0
- {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/METADATA +5 -7
- {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/RECORD +113 -112
- {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/WHEEL +1 -1
- {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/entry_points.txt +0 -1
- diffusers/models/cross_attention.py +0 -94
- {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/LICENSE +0 -0
- {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/top_level.txt +0 -0
diffusers/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.19.3"
+__version__ = "0.20.1"

 from .configuration_utils import ConfigMixin
 from .utils import (
@@ -38,6 +38,7 @@ else:
     from .models import (
         AsymmetricAutoencoderKL,
         AutoencoderKL,
+        AutoencoderTiny,
         ControlNetModel,
         ModelMixin,
         MultiAdapter,
@@ -170,6 +171,7 @@ else:
         StableDiffusionControlNetPipeline,
         StableDiffusionDepth2ImgPipeline,
         StableDiffusionDiffEditPipeline,
+        StableDiffusionGLIGENPipeline,
         StableDiffusionImageVariationPipeline,
         StableDiffusionImg2ImgPipeline,
         StableDiffusionInpaintPipeline,
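The new `StableDiffusionGLIGENPipeline` export pairs with the new `pipeline_stable_diffusion_gligen.py` module listed in the summary above. A hedged usage sketch, assuming the community GLIGEN checkpoint `masterful/gligen-1-4-generation-text-box` and a CUDA device; the prompt and box coordinates are illustrative:

```python
# Sketch only: checkpoint id, prompt, and boxes are illustrative, not part of this diff.
import torch
from diffusers import StableDiffusionGLIGENPipeline

pipe = StableDiffusionGLIGENPipeline.from_pretrained(
    "masterful/gligen-1-4-generation-text-box", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    prompt="a waterfall and a modern high speed train",
    gligen_phrases=["a waterfall", "a modern high speed train"],  # one phrase per box
    gligen_boxes=[[0.14, 0.26, 0.43, 0.86], [0.45, 0.25, 0.92, 0.75]],  # normalized xyxy
    gligen_scheduled_sampling_beta=1.0,
    num_inference_steps=50,
).images[0]
image.save("gligen_text_box.png")
```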
diffusers/commands/fp16_safetensors.py
CHANGED
@@ -27,7 +27,7 @@ import torch
 from huggingface_hub import hf_hub_download
 from packaging import version

-from ..utils import is_safetensors_available, logging
+from ..utils import logging
 from . import BaseDiffusersCLICommand


@@ -68,12 +68,7 @@ class FP16SafetensorsCommand(BaseDiffusersCLICommand):
         self.local_ckpt_dir = f"/tmp/{ckpt_id}"
         self.fp16 = fp16

-        if is_safetensors_available():
-            self.use_safetensors = use_safetensors
-        else:
-            raise ImportError(
-                "When `use_safetensors` is set to True, the `safetensors` library needs to be installed. Install it via `pip install safetensors`."
-            )
+        self.use_safetensors = use_safetensors

         if not self.use_safetensors and not self.fp16:
             raise NotImplementedError(
diffusers/configuration_utils.py
CHANGED
@@ -26,7 +26,7 @@ from pathlib import PosixPath
 from typing import Any, Dict, Tuple, Union

 import numpy as np
-from huggingface_hub import hf_hub_download
+from huggingface_hub import create_repo, hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
 from requests import HTTPError

@@ -144,6 +144,12 @@ class ConfigMixin:
         Args:
             save_directory (`str` or `os.PathLike`):
                 Directory where the configuration JSON file is saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -156,6 +162,22 @@ class ConfigMixin:
         self.to_json_file(output_config_file)
         logger.info(f"Configuration saved in {output_config_file}")

+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            private = kwargs.pop("private", False)
+            create_pr = kwargs.pop("create_pr", False)
+            token = kwargs.pop("token", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
+            self._upload_folder(
+                save_directory,
+                repo_id,
+                token=token,
+                commit_message=commit_message,
+                create_pr=create_pr,
+            )
+
     @classmethod
     def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs):
         r"""
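The `push_to_hub` addition to `ConfigMixin.save_config` follows the `PushToHubMixin` flow shown in the hunk above. A minimal sketch, assuming a valid Hub login; `"your-username/ddim-config"` is an illustrative placeholder repo id:

```python
# Sketch: save a config locally and push it to the Hub in one call (new in 0.20).
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()
scheduler.save_config(
    "ddim-config",                        # local directory the config JSON is written to
    push_to_hub=True,                     # triggers create_repo(...) + _upload_folder(...)
    repo_id="your-username/ddim-config",  # optional; defaults to the directory name
    private=True,
    commit_message="Add scheduler config",
)
```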
diffusers/dependency_versions_table.py
CHANGED
@@ -29,7 +29,7 @@ deps = {
     "pytest": "pytest",
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
-    "ruff": "ruff
+    "ruff": "ruff==0.0.280",
     "safetensors": "safetensors>=0.3.1",
     "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
     "scipy": "scipy",
diffusers/loaders.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import os
 import re
 import warnings
@@ -21,6 +22,7 @@ from pathlib import Path
 from typing import Callable, Dict, List, Optional, Union

 import requests
+import safetensors
 import torch
 import torch.nn.functional as F
 from huggingface_hub import hf_hub_download
@@ -33,16 +35,12 @@ from .utils import (
     deprecate,
     is_accelerate_available,
     is_omegaconf_available,
-    is_safetensors_available,
     is_transformers_available,
     logging,
 )
 from .utils.import_utils import BACKENDS_MAPPING


-if is_safetensors_available():
-    import safetensors
-
 if is_transformers_available():
     from transformers import CLIPTextModel, CLIPTextModelWithProjection, PreTrainedModel, PreTrainedTokenizer

@@ -189,7 +187,7 @@ class UNet2DConditionLoadersMixin:
         r"""
         Load pretrained attention processor layers into [`UNet2DConditionModel`]. Attention processor layers have to be
         defined in
-        [`cross_attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py)
+        [`attention_processor.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py)
         and be a `torch.nn.Module` class.

         Parameters:
@@ -258,15 +256,12 @@ class UNet2DConditionLoadersMixin:
         # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
         # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
         network_alphas = kwargs.pop("network_alphas", None)
-
-        if use_safetensors and not is_safetensors_available():
-            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
-            )
+        is_network_alphas_none = network_alphas is None

         allow_pickle = False
+
         if use_safetensors is None:
-            use_safetensors = is_safetensors_available()
+            use_safetensors = True
             allow_pickle = True

         user_agent = {
@@ -349,13 +344,20 @@ class UNet2DConditionLoadersMixin:

             # Create another `mapped_network_alphas` dictionary so that we can properly map them.
             if network_alphas is not None:
-                for k in network_alphas:
+                network_alphas_ = copy.deepcopy(network_alphas)
+                for k in network_alphas_:
                     if k.replace(".alpha", "") in key:
-                        mapped_network_alphas.update({attn_processor_key: network_alphas.get(k)})
+                        mapped_network_alphas.update({attn_processor_key: network_alphas.pop(k)})
+
+        if not is_network_alphas_none:
+            if len(network_alphas) > 0:
+                raise ValueError(
+                    f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
+                )

         if len(state_dict) > 0:
             raise ValueError(
-                f"The state_dict has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
+                f"The `state_dict` has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
             )

         for key, value_dict in lora_grouped_dict.items():
@@ -434,14 +436,6 @@ class UNet2DConditionLoadersMixin:
                     v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight"),
                     out_rank=rank_mapping.get("to_out_lora.down.weight"),
                     out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight"),
-                    # rank=rank_mapping.get("to_k_lora.down.weight", None),
-                    # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
-                    # q_rank=rank_mapping.get("to_q_lora.down.weight", None),
-                    # q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight", None),
-                    # v_rank=rank_mapping.get("to_v_lora.down.weight", None),
-                    # v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight", None),
-                    # out_rank=rank_mapping.get("to_out_lora.down.weight", None),
-                    # out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight", None),
                 )
             else:
                 attn_processors[key] = attn_processor_class(
@@ -496,9 +490,6 @@ class UNet2DConditionLoadersMixin:
         # set ff layers
         for target_module, lora_layer in non_attn_lora_layers:
             target_module.set_lora_layer(lora_layer)
-            # It should raise an error if we don't have a set lora here
-            # if hasattr(target_module, "set_lora_layer"):
-            #     target_module.set_lora_layer(lora_layer)

     def save_attn_procs(
         self,
@@ -506,7 +497,7 @@ class UNet2DConditionLoadersMixin:
         is_main_process: bool = True,
         weight_name: str = None,
         save_function: Callable = None,
-        safe_serialization: bool = False,
+        safe_serialization: bool = True,
         **kwargs,
     ):
         r"""
@@ -524,19 +515,14 @@ class UNet2DConditionLoadersMixin:
                 The function to use to save the state dictionary. Useful during distributed training when you need to
                 replace `torch.save` with another method. Can be configured with the environment variable
                 `DIFFUSERS_SAVE_MODE`.
-
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
         """
         from .models.attention_processor import (
             CustomDiffusionAttnProcessor,
             CustomDiffusionXFormersAttnProcessor,
         )

-        weight_name = weight_name or deprecate(
-            "weights_name",
-            "0.20.0",
-            "`weights_name` is deprecated, please use `weight_name` instead.",
-            take_from=kwargs,
-        )
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
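Because `safe_serialization` now defaults to `True`, `save_attn_procs` writes a `.safetensors` file unless told otherwise. A hedged sketch; the directory names are placeholders:

```python
# Sketch: attention-processor (LoRA) saving now defaults to safetensors serialization.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# ... after attaching/training LoRA attention processors on pipe.unet ...
pipe.unet.save_attn_procs("my-lora")                                # writes pytorch_lora_weights.safetensors
pipe.unet.save_attn_procs("my-lora-bin", safe_serialization=False)  # opt back into the pickled .bin format
```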
@@ -766,14 +752,9 @@ class TextualInversionLoaderMixin:
         weight_name = kwargs.pop("weight_name", None)
         use_safetensors = kwargs.pop("use_safetensors", None)

-        if use_safetensors and not is_safetensors_available():
-            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
-            )
-
         allow_pickle = False
         if use_safetensors is None:
-            use_safetensors = is_safetensors_available()
+            use_safetensors = True
             allow_pickle = True

         user_agent = {
@@ -1023,14 +1004,9 @@ class LoraLoaderMixin:
         unet_config = kwargs.pop("unet_config", None)
         use_safetensors = kwargs.pop("use_safetensors", None)

-        if use_safetensors and not is_safetensors_available():
-            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
-            )
-
         allow_pickle = False
         if use_safetensors is None:
-            use_safetensors = is_safetensors_available()
+            use_safetensors = True
             allow_pickle = True

         user_agent = {
@@ -1063,6 +1039,7 @@ class LoraLoaderMixin:
                 if not allow_pickle:
                     raise e
                 # try loading non-safetensors weights
+                model_file = None
                 pass
         if model_file is None:
             model_file = _get_model_file(
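On the loading side, `use_safetensors` now defaults to `True` with `allow_pickle` as the fallback, so safetensors weights are tried before pickled ones. A sketch; the LoRA repo id is an illustrative placeholder:

```python
# Sketch: LoRA loading prefers *.safetensors and falls back to .bin only when pickling is allowed.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.load_lora_weights("some-user/some-lora")                         # tries safetensors first
pipe.load_lora_weights("some-user/some-lora", use_safetensors=False)  # force the pickle path
```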
@@ -1258,9 +1235,10 @@ class LoraLoaderMixin:
         keys = list(state_dict.keys())
         prefix = cls.text_encoder_name if prefix is None else prefix

+        # Safe prefix to check with.
         if any(cls.text_encoder_name in key for key in keys):
             # Load the layers corresponding to text encoder and make necessary adjustments.
-            text_encoder_keys = [k for k in keys if k.startswith(prefix)]
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
             text_encoder_lora_state_dict = {
                 k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
             }
@@ -1310,6 +1288,14 @@ class LoraLoaderMixin:
             ].shape[1]
             patch_mlp = any(".mlp." in key for key in text_encoder_lora_state_dict.keys())

+            if network_alphas is not None:
+                alpha_keys = [
+                    k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                ]
+                network_alphas = {
+                    k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                }
+
             cls._modify_text_encoder(
                 text_encoder,
                 lora_scale,
@@ -1371,12 +1357,13 @@ class LoraLoaderMixin:

         lora_parameters = []
         network_alphas = {} if network_alphas is None else network_alphas
+        is_network_alphas_populated = len(network_alphas) > 0

         for name, attn_module in text_encoder_attn_modules(text_encoder):
-            query_alpha = network_alphas.get(name + ".to_q_lora.down.weight.alpha")
-            key_alpha = network_alphas.get(name + ".to_k_lora.down.weight.alpha")
-            value_alpha = network_alphas.get(name + ".to_v_lora.down.weight.alpha")
-
+            query_alpha = network_alphas.pop(name + ".to_q_lora.down.weight.alpha", None)
+            key_alpha = network_alphas.pop(name + ".to_k_lora.down.weight.alpha", None)
+            value_alpha = network_alphas.pop(name + ".to_v_lora.down.weight.alpha", None)
+            out_alpha = network_alphas.pop(name + ".to_out_lora.down.weight.alpha", None)

             attn_module.q_proj = PatchedLoraProjection(
                 attn_module.q_proj, lora_scale, network_alpha=query_alpha, rank=rank, dtype=dtype
@@ -1394,14 +1381,14 @@ class LoraLoaderMixin:
             lora_parameters.extend(attn_module.v_proj.lora_linear_layer.parameters())

             attn_module.out_proj = PatchedLoraProjection(
-                attn_module.out_proj, lora_scale, network_alpha=None, rank=rank, dtype=dtype
+                attn_module.out_proj, lora_scale, network_alpha=out_alpha, rank=rank, dtype=dtype
             )
             lora_parameters.extend(attn_module.out_proj.lora_linear_layer.parameters())

         if patch_mlp:
             for name, mlp_module in text_encoder_mlp_modules(text_encoder):
-                fc1_alpha = network_alphas.get(name + ".fc1.lora_linear_layer.down.weight.alpha")
-                fc2_alpha = network_alphas.get(name + ".fc2.lora_linear_layer.down.weight.alpha")
+                fc1_alpha = network_alphas.pop(name + ".fc1.lora_linear_layer.down.weight.alpha")
+                fc2_alpha = network_alphas.pop(name + ".fc2.lora_linear_layer.down.weight.alpha")

                 mlp_module.fc1 = PatchedLoraProjection(
                     mlp_module.fc1, lora_scale, network_alpha=fc1_alpha, rank=rank, dtype=dtype
@@ -1413,6 +1400,11 @@ class LoraLoaderMixin:
                 )
                 lora_parameters.extend(mlp_module.fc2.lora_linear_layer.parameters())

+        if is_network_alphas_populated and len(network_alphas) > 0:
+            raise ValueError(
+                f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
+            )
+
         return lora_parameters

     @classmethod
@@ -1424,7 +1416,7 @@ class LoraLoaderMixin:
         is_main_process: bool = True,
         weight_name: str = None,
         save_function: Callable = None,
-        safe_serialization: bool = False,
+        safe_serialization: bool = True,
     ):
         r"""
         Save the LoRA parameters corresponding to the UNet and text encoder.
@@ -1445,6 +1437,8 @@ class LoraLoaderMixin:
                 The function to use to save the state dictionary. Useful during distributed training when you need to
                 replace `torch.save` with another method. Can be configured with the environment variable
                 `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
         """
         # Create a flat dictionary.
         state_dict = {}
@@ -1526,10 +1520,6 @@ class LoraLoaderMixin:
         lora_name_up = lora_name + ".lora_up.weight"
         lora_name_alpha = lora_name + ".alpha"

-        # if lora_name_alpha in state_dict:
-        #     alpha = state_dict.pop(lora_name_alpha).item()
-        #     network_alphas.update({lora_name_alpha: alpha})
-
         if lora_name.startswith("lora_unet_"):
             diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
@@ -1851,7 +1841,7 @@ class FromSingleFileMixin:

         torch_dtype = kwargs.pop("torch_dtype", None)

-        use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+        use_safetensors = kwargs.pop("use_safetensors", None)

         pipeline_name = cls.__name__
         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
@@ -1892,16 +1882,24 @@ class FromSingleFileMixin:
             raise ValueError(f"Unhandled pipeline class: {pipeline_name}")

         # remove huggingface url
-        for prefix in ["https://huggingface.co/", "huggingface.co/"]:
+        has_valid_url_prefix = False
+        valid_url_prefixes = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
+        for prefix in valid_url_prefixes:
             if pretrained_model_link_or_path.startswith(prefix):
                 pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
+                has_valid_url_prefix = True

         # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
         ckpt_path = Path(pretrained_model_link_or_path)
         if not ckpt_path.is_file():
+            if not has_valid_url_prefix:
+                raise ValueError(
+                    f"The provided path is either not a file or a valid huggingface URL was not provided. Valid URLs begin with {', '.join(valid_url_prefixes)}"
+                )
+
             # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id =
-            file_path =
+            repo_id = "/".join(ckpt_path.parts[:2])
+            file_path = "/".join(ckpt_path.parts[2:])

             if file_path.startswith("blob/"):
                 file_path = file_path[len("blob/") :]
@@ -2048,7 +2046,7 @@ class FromOriginalVAEMixin:

         torch_dtype = kwargs.pop("torch_dtype", None)

-        use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+        use_safetensors = kwargs.pop("use_safetensors", None)

         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
         from_safetensors = file_extension == "safetensors"
@@ -2221,7 +2219,7 @@ class FromOriginalControlnetMixin:

         torch_dtype = kwargs.pop("torch_dtype", None)

-        use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+        use_safetensors = kwargs.pop("use_safetensors", None)

         file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
         from_safetensors = file_extension == "safetensors"
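The URL handling above makes `from_single_file` reject inputs that are neither a local file nor a recognized Hub URL, and adds `hf.co` short links to the accepted prefixes. A sketch; the checkpoint URL is the standard v1-5 single-file example and assumed to be reachable:

```python
# Sketch: single-file loading with one of the newly validated URL prefixes.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_single_file(
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
)
# Anything that is neither an existing local file nor a recognized URL now raises the ValueError above.
```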
diffusers/models/__init__.py
CHANGED
@@ -19,6 +19,7 @@ if is_torch_available():
     from .adapter import MultiAdapter, T2IAdapter
     from .autoencoder_asym_kl import AsymmetricAutoencoderKL
     from .autoencoder_kl import AutoencoderKL
+    from .autoencoder_tiny import AutoencoderTiny
     from .controlnet import ControlNetModel
     from .dual_transformer_2d import DualTransformer2DModel
     from .modeling_utils import ModelMixin
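`AutoencoderTiny` (the new `autoencoder_tiny.py` module in the summary) wraps the tiny TAESD autoencoder; the usual pattern is to swap it in as a pipeline's VAE for fast decoding. A hedged sketch, assuming the community checkpoint `madebyollin/taesd` and a CUDA device:

```python
# Sketch: replace a pipeline's VAE with the new AutoencoderTiny for cheap decoding.
import torch
from diffusers import AutoencoderTiny, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesd", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

image = pipe("slice of delicious New York-style cheesecake", num_inference_steps=25).images[0]
```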
diffusers/models/activations.py
CHANGED
diffusers/models/attention.py
CHANGED
@@ -24,6 +24,38 @@ from .embeddings import CombinedTimestepLabelEmbeddings
 from .lora import LoRACompatibleLinear


+@maybe_allow_in_graph
+class GatedSelfAttentionDense(nn.Module):
+    def __init__(self, query_dim, context_dim, n_heads, d_head):
+        super().__init__()
+
+        # we need a linear projection since we need cat visual feature and obj feature
+        self.linear = nn.Linear(context_dim, query_dim)
+
+        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
+        self.ff = FeedForward(query_dim, activation_fn="geglu")
+
+        self.norm1 = nn.LayerNorm(query_dim)
+        self.norm2 = nn.LayerNorm(query_dim)
+
+        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
+        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
+
+        self.enabled = True
+
+    def forward(self, x, objs):
+        if not self.enabled:
+            return x
+
+        n_visual = x.shape[1]
+        objs = self.linear(objs)
+
+        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
+        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
+
+        return x
+
+
 @maybe_allow_in_graph
 class BasicTransformerBlock(nn.Module):
     r"""
@@ -62,6 +94,7 @@ class BasicTransformerBlock(nn.Module):
         norm_elementwise_affine: bool = True,
         norm_type: str = "layer_norm",
         final_dropout: bool = False,
+        attention_type: str = "default",
     ):
         super().__init__()
         self.only_cross_attention = only_cross_attention
@@ -120,6 +153,10 @@ class BasicTransformerBlock(nn.Module):
         self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)

+        # 4. Fuser
+        if attention_type == "gated":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
         # let chunk size default to None
         self._chunk_size = None
         self._chunk_dim = 0
@@ -150,7 +187,9 @@ class BasicTransformerBlock(nn.Module):
         else:
             norm_hidden_states = self.norm1(hidden_states)

-        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        # 0. Prepare GLIGEN inputs
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

         attn_output = self.attn1(
             norm_hidden_states,
@@ -162,6 +201,11 @@ class BasicTransformerBlock(nn.Module):
         attn_output = gate_msa.unsqueeze(1) * attn_output
         hidden_states = attn_output + hidden_states

+        # 1.5 GLIGEN Control
+        if gligen_kwargs is not None:
+            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+        # 1.5 ends
+
         # 2. Cross-Attention
         if self.attn2 is not None:
             norm_hidden_states = (
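The `gligen` entry in `cross_attention_kwargs` is popped before self-attention and routed to the fuser, so grounding tokens never reach `attn1` itself. A standalone sketch of the block-level behavior; the dimensions and token counts are illustrative, not prescribed by the diff:

```python
# Sketch: exercising the new gated (GLIGEN) fuser path directly on a BasicTransformerBlock.
import torch
from diffusers.models.attention import BasicTransformerBlock

block = BasicTransformerBlock(
    dim=320,
    num_attention_heads=8,
    attention_head_dim=40,
    cross_attention_dim=768,
    attention_type="gated",  # new in 0.20: installs GatedSelfAttentionDense as self.fuser
)

hidden_states = torch.randn(1, 64, 320)  # visual tokens
objs = torch.randn(1, 30, 768)           # grounding tokens; the fuser projects 768 -> 320

out = block(
    hidden_states,
    encoder_hidden_states=torch.randn(1, 77, 768),
    cross_attention_kwargs={"gligen": {"objs": objs}},  # popped in forward, fed to self.fuser
)
print(out.shape)  # torch.Size([1, 64, 320])
```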