diffusers 0.17.1__py3-none-any.whl → 0.18.2__py3-none-any.whl

Files changed (120)
  1. diffusers/__init__.py +26 -1
  2. diffusers/configuration_utils.py +34 -29
  3. diffusers/dependency_versions_table.py +4 -0
  4. diffusers/image_processor.py +125 -12
  5. diffusers/loaders.py +169 -203
  6. diffusers/models/attention.py +24 -1
  7. diffusers/models/attention_flax.py +10 -5
  8. diffusers/models/attention_processor.py +3 -0
  9. diffusers/models/autoencoder_kl.py +114 -33
  10. diffusers/models/controlnet.py +131 -14
  11. diffusers/models/controlnet_flax.py +37 -26
  12. diffusers/models/cross_attention.py +17 -17
  13. diffusers/models/embeddings.py +67 -0
  14. diffusers/models/modeling_flax_utils.py +64 -56
  15. diffusers/models/modeling_utils.py +193 -104
  16. diffusers/models/prior_transformer.py +207 -37
  17. diffusers/models/resnet.py +26 -26
  18. diffusers/models/transformer_2d.py +36 -41
  19. diffusers/models/transformer_temporal.py +24 -21
  20. diffusers/models/unet_1d.py +31 -25
  21. diffusers/models/unet_2d.py +43 -30
  22. diffusers/models/unet_2d_blocks.py +210 -89
  23. diffusers/models/unet_2d_blocks_flax.py +12 -12
  24. diffusers/models/unet_2d_condition.py +172 -64
  25. diffusers/models/unet_2d_condition_flax.py +38 -24
  26. diffusers/models/unet_3d_blocks.py +34 -31
  27. diffusers/models/unet_3d_condition.py +101 -34
  28. diffusers/models/vae.py +5 -5
  29. diffusers/models/vae_flax.py +37 -34
  30. diffusers/models/vq_model.py +23 -14
  31. diffusers/pipelines/__init__.py +24 -1
  32. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +1 -1
  33. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -3
  34. diffusers/pipelines/consistency_models/__init__.py +1 -0
  35. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +337 -0
  36. diffusers/pipelines/controlnet/multicontrolnet.py +120 -1
  37. diffusers/pipelines/controlnet/pipeline_controlnet.py +59 -17
  38. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +60 -15
  39. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +60 -17
  40. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  41. diffusers/pipelines/kandinsky/__init__.py +1 -1
  42. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +4 -6
  43. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +1 -0
  44. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -0
  45. diffusers/pipelines/kandinsky2_2/__init__.py +7 -0
  46. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +317 -0
  47. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +372 -0
  48. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +434 -0
  49. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +398 -0
  50. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +531 -0
  51. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +541 -0
  52. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +605 -0
  53. diffusers/pipelines/pipeline_flax_utils.py +2 -2
  54. diffusers/pipelines/pipeline_utils.py +124 -146
  55. diffusers/pipelines/shap_e/__init__.py +27 -0
  56. diffusers/pipelines/shap_e/camera.py +147 -0
  57. diffusers/pipelines/shap_e/pipeline_shap_e.py +390 -0
  58. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +349 -0
  59. diffusers/pipelines/shap_e/renderer.py +709 -0
  60. diffusers/pipelines/stable_diffusion/__init__.py +2 -0
  61. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +261 -66
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -3
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -2
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +1 -1
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +719 -0
  69. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -1
  70. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +832 -0
  71. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +17 -7
  72. diffusers/pipelines/stable_diffusion_xl/__init__.py +26 -0
  73. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +823 -0
  74. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +896 -0
  75. diffusers/pipelines/stable_diffusion_xl/watermark.py +31 -0
  76. diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -1
  77. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +5 -1
  78. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +771 -0
  79. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +92 -6
  80. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  81. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +209 -91
  82. diffusers/schedulers/__init__.py +3 -0
  83. diffusers/schedulers/scheduling_consistency_models.py +380 -0
  84. diffusers/schedulers/scheduling_ddim.py +28 -6
  85. diffusers/schedulers/scheduling_ddim_inverse.py +19 -4
  86. diffusers/schedulers/scheduling_ddim_parallel.py +642 -0
  87. diffusers/schedulers/scheduling_ddpm.py +53 -7
  88. diffusers/schedulers/scheduling_ddpm_parallel.py +604 -0
  89. diffusers/schedulers/scheduling_deis_multistep.py +66 -11
  90. diffusers/schedulers/scheduling_dpmsolver_multistep.py +55 -13
  91. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +19 -4
  92. diffusers/schedulers/scheduling_dpmsolver_sde.py +73 -11
  93. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +23 -7
  94. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +58 -9
  95. diffusers/schedulers/scheduling_euler_discrete.py +58 -8
  96. diffusers/schedulers/scheduling_heun_discrete.py +89 -14
  97. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +73 -11
  98. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +73 -11
  99. diffusers/schedulers/scheduling_lms_discrete.py +57 -8
  100. diffusers/schedulers/scheduling_pndm.py +46 -10
  101. diffusers/schedulers/scheduling_repaint.py +19 -4
  102. diffusers/schedulers/scheduling_sde_ve.py +5 -1
  103. diffusers/schedulers/scheduling_unclip.py +43 -4
  104. diffusers/schedulers/scheduling_unipc_multistep.py +48 -7
  105. diffusers/training_utils.py +1 -1
  106. diffusers/utils/__init__.py +2 -1
  107. diffusers/utils/dummy_pt_objects.py +60 -0
  108. diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py +32 -0
  109. diffusers/utils/dummy_torch_and_transformers_objects.py +180 -0
  110. diffusers/utils/hub_utils.py +1 -1
  111. diffusers/utils/import_utils.py +20 -3
  112. diffusers/utils/logging.py +15 -18
  113. diffusers/utils/outputs.py +3 -3
  114. diffusers/utils/testing_utils.py +15 -0
  115. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/METADATA +4 -2
  116. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/RECORD +120 -94
  117. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/WHEEL +1 -1
  118. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/LICENSE +0 -0
  119. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/entry_points.txt +0 -0
  120. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/top_level.txt +0 -0
diffusers/models/prior_transformer.py
@@ -1,5 +1,5 @@
  from dataclasses import dataclass
- from typing import Optional, Union
+ from typing import Dict, Optional, Union

  import torch
  import torch.nn.functional as F
@@ -8,6 +8,7 @@ from torch import nn
  from ..configuration_utils import ConfigMixin, register_to_config
  from ..utils import BaseOutput
  from .attention import BasicTransformerBlock
+ from .attention_processor import AttentionProcessor, AttnProcessor
  from .embeddings import TimestepEmbedding, Timesteps
  from .modeling_utils import ModelMixin

@@ -15,6 +16,8 @@ from .modeling_utils import ModelMixin
  @dataclass
  class PriorTransformerOutput(BaseOutput):
      """
+     The output of [`PriorTransformer`].
+
      Args:
          predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
              The predicted CLIP image embedding conditioned on the CLIP text embedding input.
@@ -25,27 +28,39 @@ class PriorTransformerOutput(BaseOutput):

  class PriorTransformer(ModelMixin, ConfigMixin):
      """
-     The prior transformer from unCLIP is used to predict CLIP image embeddings from CLIP text embeddings. Note that the
-     transformer predicts the image embeddings through a denoising diffusion process.
-
-     This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
-     implements for all the models (such as downloading or saving, etc.)
-
-     For more details, see the original paper: https://arxiv.org/abs/2204.06125
+     A Prior Transformer model.

      Parameters:
          num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention.
          attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
          num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use.
-         embedding_dim (`int`, *optional*, defaults to 768): The dimension of the CLIP embeddings. Note that CLIP
-             image embeddings and text embeddings are both the same dimension.
-         num_embeddings (`int`, *optional*, defaults to 77): The max number of clip embeddings allowed. I.e. the
-             length of the prompt after it has been tokenized.
+         embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states`
+         num_embeddings (`int`, *optional*, defaults to 77):
+             The number of embeddings of the model input `hidden_states`
          additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the
-             projected hidden_states. The actual length of the used hidden_states is `num_embeddings +
+             projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
              additional_embeddings`.
          dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-
+         time_embed_act_fn (`str`, *optional*, defaults to 'silu'):
+             The activation function to use to create timestep embeddings.
+         norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before
+             passing to Transformer blocks. Set it to `None` if normalization is not needed.
+         embedding_proj_norm_type (`str`, *optional*, defaults to None):
+             The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not
+             needed.
+         encoder_hid_proj_type (`str`, *optional*, defaults to `linear`):
+             The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if
+             `encoder_hidden_states` is `None`.
+         added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model.
+             Choose from `prd` or `None`. if choose `prd`, it will prepend a token indicating the (quantized) dot
+             product between the text embedding and image embedding as proposed in the unclip paper
+             https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended.
+         time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings.
+             If None, will be set to `num_attention_heads * attention_head_dim`
+         embedding_proj_dim (`int`, *optional*, default to None):
+             The dimension of `proj_embedding`. If None, will be set to `embedding_dim`.
+         clip_embed_dim (`int`, *optional*, default to None):
+             The dimension of the output. If None, will be set to `embedding_dim`.
      """

      @register_to_config
@@ -58,6 +73,14 @@ class PriorTransformer(ModelMixin, ConfigMixin):
          num_embeddings=77,
          additional_embeddings=4,
          dropout: float = 0.0,
+         time_embed_act_fn: str = "silu",
+         norm_in_type: Optional[str] = None,  # layer
+         embedding_proj_norm_type: Optional[str] = None,  # layer
+         encoder_hid_proj_type: Optional[str] = "linear",  # linear
+         added_emb_type: Optional[str] = "prd",  # prd
+         time_embed_dim: Optional[int] = None,
+         embedding_proj_dim: Optional[int] = None,
+         clip_embed_dim: Optional[int] = None,
      ):
          super().__init__()
          self.num_attention_heads = num_attention_heads
@@ -65,17 +88,41 @@ class PriorTransformer(ModelMixin, ConfigMixin):
          inner_dim = num_attention_heads * attention_head_dim
          self.additional_embeddings = additional_embeddings

+         time_embed_dim = time_embed_dim or inner_dim
+         embedding_proj_dim = embedding_proj_dim or embedding_dim
+         clip_embed_dim = clip_embed_dim or embedding_dim
+
          self.time_proj = Timesteps(inner_dim, True, 0)
-         self.time_embedding = TimestepEmbedding(inner_dim, inner_dim)
+         self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn)

          self.proj_in = nn.Linear(embedding_dim, inner_dim)

-         self.embedding_proj = nn.Linear(embedding_dim, inner_dim)
-         self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
+         if embedding_proj_norm_type is None:
+             self.embedding_proj_norm = None
+         elif embedding_proj_norm_type == "layer":
+             self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim)
+         else:
+             raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}")
+
+         self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim)
+
+         if encoder_hid_proj_type is None:
+             self.encoder_hidden_states_proj = None
+         elif encoder_hid_proj_type == "linear":
+             self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
+         else:
+             raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}")

          self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim))

-         self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim))
+         if added_emb_type == "prd":
+             self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim))
+         elif added_emb_type is None:
+             self.prd_embedding = None
+         else:
+             raise ValueError(
+                 f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`."
+             )

          self.transformer_blocks = nn.ModuleList(
              [
@@ -91,8 +138,16 @@ class PriorTransformer(ModelMixin, ConfigMixin):
              ]
          )

+         if norm_in_type == "layer":
+             self.norm_in = nn.LayerNorm(inner_dim)
+         elif norm_in_type is None:
+             self.norm_in = None
+         else:
+             raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.")
+
          self.norm_out = nn.LayerNorm(inner_dim)
-         self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim)
+
+         self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim)

          causal_attention_mask = torch.full(
              [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0
@@ -101,23 +156,92 @@ class PriorTransformer(ModelMixin, ConfigMixin):
          causal_attention_mask = causal_attention_mask[None, ...]
          self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False)

-         self.clip_mean = nn.Parameter(torch.zeros(1, embedding_dim))
-         self.clip_std = nn.Parameter(torch.zeros(1, embedding_dim))
+         self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim))
+         self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim))
+
+     @property
+     # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+     def attn_processors(self) -> Dict[str, AttentionProcessor]:
+         r"""
+         Returns:
+             `dict` of attention processors: A dictionary containing all attention processors used in the model with
+             indexed by its weight name.
+         """
+         # set recursively
+         processors = {}
+
+         def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+             if hasattr(module, "set_processor"):
+                 processors[f"{name}.processor"] = module.processor
+
+             for sub_name, child in module.named_children():
+                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+             return processors
+
+         for name, module in self.named_children():
+             fn_recursive_add_processors(name, module, processors)
+
+         return processors
+
+     # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+     def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+         r"""
+         Sets the attention processor to use to compute attention.
+
+         Parameters:
+             processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                 The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                 for **all** `Attention` layers.
+
+                 If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                 processor. This is strongly recommended when setting trainable attention processors.
+
+         """
+         count = len(self.attn_processors.keys())
+
+         if isinstance(processor, dict) and len(processor) != count:
+             raise ValueError(
+                 f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                 f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+             )
+
+         def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+             if hasattr(module, "set_processor"):
+                 if not isinstance(processor, dict):
+                     module.set_processor(processor)
+                 else:
+                     module.set_processor(processor.pop(f"{name}.processor"))
+
+             for sub_name, child in module.named_children():
+                 fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+         for name, module in self.named_children():
+             fn_recursive_attn_processor(name, module, processor)
+
+     # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+     def set_default_attn_processor(self):
+         """
+         Disables custom attention processors and sets the default attention implementation.
+         """
+         self.set_attn_processor(AttnProcessor())

      def forward(
          self,
          hidden_states,
          timestep: Union[torch.Tensor, float, int],
          proj_embedding: torch.FloatTensor,
-         encoder_hidden_states: torch.FloatTensor,
+         encoder_hidden_states: Optional[torch.FloatTensor] = None,
          attention_mask: Optional[torch.BoolTensor] = None,
          return_dict: bool = True,
      ):
          """
+         The [`PriorTransformer`] forward method.
+
          Args:
              hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
-                 x_t, the currently predicted image embeddings.
-             timestep (`torch.long`):
+                 The currently predicted image embeddings.
+             timestep (`torch.LongTensor`):
                  Current denoising step.
              proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
                  Projected embedding vector the denoising process is conditioned on.
@@ -126,13 +250,13 @@ class PriorTransformer(ModelMixin, ConfigMixin):
              attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
                  Text mask for the text embeddings.
              return_dict (`bool`, *optional*, defaults to `True`):
-                 Whether or not to return a [`models.prior_transformer.PriorTransformerOutput`] instead of a plain
+                 Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain
                  tuple.

          Returns:
              [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
-                 [`~models.prior_transformer.PriorTransformerOutput`] if `return_dict` is True, otherwise a `tuple`. When
-                 returning a tuple, the first element is the sample tensor.
+                 If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a
+                 tuple is returned where the first element is the sample tensor.
          """
          batch_size = hidden_states.shape[0]

@@ -152,23 +276,61 @@ class PriorTransformer(ModelMixin, ConfigMixin):
          timesteps_projected = timesteps_projected.to(dtype=self.dtype)
          time_embeddings = self.time_embedding(timesteps_projected)

+         if self.embedding_proj_norm is not None:
+             proj_embedding = self.embedding_proj_norm(proj_embedding)
+
          proj_embeddings = self.embedding_proj(proj_embedding)
-         encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
+         if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None:
+             encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
+         elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None:
+             raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set")
+
          hidden_states = self.proj_in(hidden_states)
-         prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1)
+
          positional_embeddings = self.positional_embedding.to(hidden_states.dtype)

+         additional_embeds = []
+         additional_embeddings_len = 0
+
+         if encoder_hidden_states is not None:
+             additional_embeds.append(encoder_hidden_states)
+             additional_embeddings_len += encoder_hidden_states.shape[1]
+
+         if len(proj_embeddings.shape) == 2:
+             proj_embeddings = proj_embeddings[:, None, :]
+
+         if len(hidden_states.shape) == 2:
+             hidden_states = hidden_states[:, None, :]
+
+         additional_embeds = additional_embeds + [
+             proj_embeddings,
+             time_embeddings[:, None, :],
+             hidden_states,
+         ]
+
+         if self.prd_embedding is not None:
+             prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1)
+             additional_embeds.append(prd_embedding)
+
          hidden_states = torch.cat(
-             [
-                 encoder_hidden_states,
-                 proj_embeddings[:, None, :],
-                 time_embeddings[:, None, :],
-                 hidden_states[:, None, :],
-                 prd_embedding,
-             ],
+             additional_embeds,
              dim=1,
          )

+         # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional tokens
+         additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1
+         if positional_embeddings.shape[1] < hidden_states.shape[1]:
+             positional_embeddings = F.pad(
+                 positional_embeddings,
+                 (
+                     0,
+                     0,
+                     additional_embeddings_len,
+                     self.prd_embedding.shape[1] if self.prd_embedding is not None else 0,
+                 ),
+                 value=0.0,
+             )
+
          hidden_states = hidden_states + positional_embeddings

          if attention_mask is not None:
@@ -177,11 +339,19 @@ class PriorTransformer(ModelMixin, ConfigMixin):
              attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype)
              attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0)

+         if self.norm_in is not None:
+             hidden_states = self.norm_in(hidden_states)
+
          for block in self.transformer_blocks:
              hidden_states = block(hidden_states, attention_mask=attention_mask)

          hidden_states = self.norm_out(hidden_states)
-         hidden_states = hidden_states[:, -1]
+
+         if self.prd_embedding is not None:
+             hidden_states = hidden_states[:, -1]
+         else:
+             hidden_states = hidden_states[:, additional_embeddings_len:]
+
          predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)

          if not return_dict:
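The hunks above also give `PriorTransformer` the pluggable attention-processor surface that `UNet2DConditionModel` already exposes (`attn_processors`, `set_attn_processor`, `set_default_attn_processor`). A minimal sketch of exercising that surface against 0.18.2; the tiny config values are illustrative only, not the documented defaults:

```python
from diffusers.models import PriorTransformer
from diffusers.models.attention_processor import AttnProcessor

# Deliberately tiny, illustrative config so the model is cheap to construct; real checkpoints
# use the documented defaults (32 heads, 64 dims per head, 20 layers, embedding_dim=768).
prior = PriorTransformer(
    num_attention_heads=2,
    attention_head_dim=4,
    num_layers=2,
    embedding_dim=8,
    num_embeddings=4,
)

# New in 0.18: enumerate every attention processor, keyed by module path.
print(list(prior.attn_processors))

# New in 0.18: set a processor for all Attention layers, or restore the default implementation.
prior.set_attn_processor(AttnProcessor())
prior.set_default_attn_processor()
```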
diffusers/models/resnet.py
@@ -95,9 +95,9 @@ class Downsample1D(nn.Module):
          assert self.channels == self.out_channels
          self.conv = nn.AvgPool1d(kernel_size=stride, stride=stride)

-     def forward(self, x):
-         assert x.shape[1] == self.channels
-         return self.conv(x)
+     def forward(self, inputs):
+         assert inputs.shape[1] == self.channels
+         return self.conv(inputs)


  class Upsample2D(nn.Module):
@@ -431,13 +431,13 @@ class KDownsample2D(nn.Module):
          self.pad = kernel_1d.shape[1] // 2 - 1
          self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False)

-     def forward(self, x):
-         x = F.pad(x, (self.pad,) * 4, self.pad_mode)
-         weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
-         indices = torch.arange(x.shape[1], device=x.device)
-         kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1)
+     def forward(self, inputs):
+         inputs = F.pad(inputs, (self.pad,) * 4, self.pad_mode)
+         weight = inputs.new_zeros([inputs.shape[1], inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
+         indices = torch.arange(inputs.shape[1], device=inputs.device)
+         kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1)
          weight[indices, indices] = kernel
-         return F.conv2d(x, weight, stride=2)
+         return F.conv2d(inputs, weight, stride=2)


  class KUpsample2D(nn.Module):
@@ -448,13 +448,13 @@ class KUpsample2D(nn.Module):
          self.pad = kernel_1d.shape[1] // 2 - 1
          self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False)

-     def forward(self, x):
-         x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode)
-         weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
-         indices = torch.arange(x.shape[1], device=x.device)
-         kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1)
+     def forward(self, inputs):
+         inputs = F.pad(inputs, ((self.pad + 1) // 2,) * 4, self.pad_mode)
+         weight = inputs.new_zeros([inputs.shape[1], inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
+         indices = torch.arange(inputs.shape[1], device=inputs.device)
+         kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1)
          weight[indices, indices] = kernel
-         return F.conv_transpose2d(x, weight, stride=2, padding=self.pad * 2 + 1)
+         return F.conv_transpose2d(inputs, weight, stride=2, padding=self.pad * 2 + 1)


  class ResnetBlock2D(nn.Module):
@@ -664,13 +664,13 @@ class Conv1dBlock(nn.Module):
          self.group_norm = nn.GroupNorm(n_groups, out_channels)
          self.mish = nn.Mish()

-     def forward(self, x):
-         x = self.conv1d(x)
-         x = rearrange_dims(x)
-         x = self.group_norm(x)
-         x = rearrange_dims(x)
-         x = self.mish(x)
-         return x
+     def forward(self, inputs):
+         intermediate_repr = self.conv1d(inputs)
+         intermediate_repr = rearrange_dims(intermediate_repr)
+         intermediate_repr = self.group_norm(intermediate_repr)
+         intermediate_repr = rearrange_dims(intermediate_repr)
+         output = self.mish(intermediate_repr)
+         return output


  # unet_rl.py
@@ -687,10 +687,10 @@ class ResidualTemporalBlock1D(nn.Module):
              nn.Conv1d(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity()
          )

-     def forward(self, x, t):
+     def forward(self, inputs, t):
          """
          Args:
-             x : [ batch_size x inp_channels x horizon ]
+             inputs : [ batch_size x inp_channels x horizon ]
              t : [ batch_size x embed_dim ]

          returns:
@@ -698,9 +698,9 @@ class ResidualTemporalBlock1D(nn.Module):
          """
          t = self.time_emb_act(t)
          t = self.time_emb(t)
-         out = self.conv_in(x) + rearrange_dims(t)
+         out = self.conv_in(inputs) + rearrange_dims(t)
          out = self.conv_out(out)
-         return out + self.residual_conv(x)
+         return out + self.residual_conv(inputs)


  def upsample_2d(hidden_states, kernel=None, factor=2, gain=1):
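The resnet.py hunks are a pure rename of the forward arguments (`x` becomes `inputs`, with named intermediates in `Conv1dBlock`); the computation is unchanged. A small sketch of what the rename means for callers, assuming the existing `Downsample1D` constructor signature (not shown in this diff):

```python
import torch
from diffusers.models.resnet import Downsample1D

down = Downsample1D(channels=8, use_conv=False)  # constructor assumed unchanged between 0.17.1 and 0.18.2
sample = torch.randn(1, 8, 16)

out = down(sample)            # positional call: identical behaviour in both versions
# down(x=sample)              # keyword call: only matched the 0.17.1 signature
# down(inputs=sample)         # keyword call: only matches the 0.18.2 signature
print(out.shape)              # torch.Size([1, 8, 8]) -- AvgPool1d with stride 2
```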
diffusers/models/transformer_2d.py
@@ -29,10 +29,12 @@ from .modeling_utils import ModelMixin
  @dataclass
  class Transformer2DModelOutput(BaseOutput):
      """
+     The output of [`Transformer2DModel`].
+
      Args:
          sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
-             Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
-             for the unnoised latent pixels.
+             The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
+             distributions for the unnoised latent pixels.
      """

      sample: torch.FloatTensor
@@ -40,40 +42,30 @@ class Transformer2DModelOutput(BaseOutput):

  class Transformer2DModel(ModelMixin, ConfigMixin):
      """
-     Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
-     embeddings) inputs.
-
-     When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
-     transformer action. Finally, reshape to image.
-
-     When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
-     embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
-     classes of unnoised image.
-
-     Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
-     image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
+     A 2D Transformer model for image-like data.

      Parameters:
          num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
          attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
          in_channels (`int`, *optional*):
-             Pass if the input is continuous. The number of channels in the input and output.
+             The number of channels in the input and output (specify if the input is **continuous**).
          num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
          dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-         cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
-         sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
-             Note that this is fixed at training time as it is used for learning a number of position embeddings. See
-             `ImagePositionalEmbeddings`.
+         cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+         sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+             This is fixed during training since it is used to learn a number of position embeddings.
          num_vector_embeds (`int`, *optional*):
-             Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+             The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
              Includes the class for the masked latent pixel.
-         activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-         num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
-             The number of diffusion steps used during training. Note that this is fixed at training time as it is used
-             to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
-             up to but not more than steps than `num_embeds_ada_norm`.
+         activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+         num_embeds_ada_norm ( `int`, *optional*):
+             The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+             `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+             added to the hidden states.
+
+             During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
          attention_bias (`bool`, *optional*):
-             Configure if the TransformerBlocks' attention should contain a bias parameter.
+             Configure if the `TransformerBlocks` attention should contain a bias parameter.
      """

      @register_to_config
@@ -223,31 +215,34 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
          return_dict: bool = True,
      ):
          """
+         The [`Transformer2DModel`] forward method.
+
          Args:
-             hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
-                 When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
-                 hidden_states
+             hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+                 Input `hidden_states`.
              encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
                  Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                  self-attention.
              timestep ( `torch.LongTensor`, *optional*):
-                 Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+                 Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
              class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
-                 Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
-                 conditioning.
-             encoder_attention_mask ( `torch.Tensor`, *optional* ).
-                 Cross-attention mask, applied to encoder_hidden_states. Two formats supported:
-                     Mask `(batch, sequence_length)` True = keep, False = discard. Bias `(batch, 1, sequence_length)` 0
-                     = keep, -10000 = discard.
-                 If ndim == 2: will be interpreted as a mask, then converted into a bias consistent with the format
+                 Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                 `AdaLayerZeroNorm`.
+             encoder_attention_mask ( `torch.Tensor`, *optional*):
+                 Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                     * Mask `(batch, sequence_length)` True = keep, False = discard.
+                     * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                 If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
                  above. This bias will be added to the cross-attention scores.
              return_dict (`bool`, *optional*, defaults to `True`):
-                 Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+                 Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                 tuple.

          Returns:
-             [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
-                 [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
-                 returning a tuple, the first element is the sample tensor.
+             If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+             `tuple` where the first element is the sample tensor.
          """
          # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
          # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
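The rewritten `encoder_attention_mask` documentation above lists the two accepted formats. A minimal sketch (not part of the diff) of the mask-to-bias conversion those docs imply, i.e. what the `ndim == 2` branch turns a boolean keep-mask into:

```python
import torch

keep_mask = torch.tensor([[True, True, False]])       # (batch, sequence_length), True = keep
bias = (1 - keep_mask.to(torch.float32)) * -10000.0   # 0 where kept, -10000 where discarded
bias = bias.unsqueeze(1)                               # (batch, 1, sequence_length)

# Adding `bias` to the cross-attention scores leaves kept positions untouched and pushes
# discarded positions to effectively zero probability after the softmax.
```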