diffusers 0.32.2 → 0.33.1 (py3-none-any.whl)
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +121 -86
- diffusers/loaders/lora_conversion_utils.py +504 -44
- diffusers/loaders/lora_pipeline.py +1769 -181
- diffusers/loaders/peft.py +167 -57
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +646 -72
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +20 -7
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +595 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +724 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +727 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +9 -1
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +2 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/METADATA +21 -4
- diffusers-0.33.1.dist-info/RECORD +608 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/WHEEL +1 -1
- diffusers-0.32.2.dist-info/RECORD +0 -550
- {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/LICENSE +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.1.dist-info}/top_level.txt +0 -0
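
The listing above already sketches the shape of the 0.33 release: a new `diffusers/hooks/` package (group offloading, layerwise casting, Pyramid Attention Broadcast, FasterCache), a new `AutoModel` entry point, a Quanto quantization backend, remote utilities, and a wave of new pipelines (Wan, Lumina2, CogView4, OmniGen, EasyAnimate, ConsisID, SANA Sprint, among others). As orientation for the hunks below, here is a minimal sketch of how two of the new memory features are exposed on models in 0.33 — the method names match the 0.33 release notes, but treat the exact signatures as assumptions and verify them against your installed version:

```python
import torch
from diffusers import AutoModel  # new in 0.33 (diffusers/models/auto_model.py)

# Load a model component without naming its concrete class up front.
transformer = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
)

# Group offloading (diffusers/hooks/group_offloading.py): keep only a few
# transformer blocks on the accelerator at a time, streaming the rest from CPU.
transformer.enable_group_offload(
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="block_level",
    num_blocks_per_group=2,
)
```

The hunks captured on this page all come from `diffusers/models/transformers/` and show the model-side plumbing for these features.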
--- a/diffusers/models/transformers/dit_transformer_2d.py
+++ b/diffusers/models/transformers/dit_transformer_2d.py
@@ -18,7 +18,7 @@ import torch.nn.functional as F
 from torch import nn

 from ...configuration_utils import ConfigMixin, register_to_config
-from ...utils import is_torch_version, logging
+from ...utils import logging
 from ..attention import BasicTransformerBlock
 from ..embeddings import PatchEmbed
 from ..modeling_outputs import Transformer2DModelOutput
@@ -64,7 +64,9 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
            A small constant added to the denominator in normalization layers to prevent division by zero.
    """

+    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
     _supports_gradient_checkpointing = True
+    _supports_group_offloading = False

     @register_to_config
     def __init__(
@@ -143,10 +145,6 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
             self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels
         )

-    def _set_gradient_checkpointing(self, module, value=False):
-        if hasattr(module, "gradient_checkpointing"):
-            module.gradient_checkpointing = value
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -185,19 +183,8 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
         # 2. Blocks
         for block in self.transformer_blocks:
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-                def create_custom_forward(module, return_dict=None):
-                    def custom_forward(*inputs):
-                        if return_dict is not None:
-                            return module(*inputs, return_dict=return_dict)
-                        else:
-                            return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
                     hidden_states,
                     None,
                     None,
@@ -205,7 +192,6 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
                     timestep,
                     cross_attention_kwargs,
                     class_labels,
-                    **ckpt_kwargs,
                 )
             else:
                 hidden_states = block(
--- a/diffusers/models/transformers/hunyuan_transformer_2d.py
+++ b/diffusers/models/transformers/hunyuan_transformer_2d.py
@@ -244,6 +244,9 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
            Whether or not to use style condition and image meta size. True for version <=1.1, False for version >= 1.2
    """

+    _skip_layerwise_casting_patterns = ["pos_embed", "norm", "pooler"]
+    _supports_group_offloading = False
+
     @register_to_config
     def __init__(
         self,
@@ -277,9 +280,7 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
             act_fn="silu_fp32",
         )

-        self.text_embedding_padding = nn.Parameter(
-            torch.randn(text_len + text_len_t5, cross_attention_dim, dtype=torch.float32)
-        )
+        self.text_embedding_padding = nn.Parameter(torch.randn(text_len + text_len_t5, cross_attention_dim))

         self.pos_embed = PatchEmbed(
             height=sample_size,
--- a/diffusers/models/transformers/latte_transformer_3d.py
+++ b/diffusers/models/transformers/latte_transformer_3d.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from typing import Optional

 import torch
@@ -19,13 +20,14 @@ from torch import nn
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...models.embeddings import PixArtAlphaTextProjection, get_1d_sincos_pos_embed_from_grid
 from ..attention import BasicTransformerBlock
+from ..cache_utils import CacheMixin
 from ..embeddings import PatchEmbed
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
 from ..normalization import AdaLayerNormSingle


-class LatteTransformer3DModel(ModelMixin, ConfigMixin):
+class LatteTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
     _supports_gradient_checkpointing = True

     """
@@ -65,6 +67,8 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
            The number of frames in the video-like data.
    """

+    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
+
     @register_to_config
     def __init__(
         self,
@@ -162,9 +166,6 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):

         self.gradient_checkpointing = False

-    def _set_gradient_checkpointing(self, module, value=False):
-        self.gradient_checkpointing = value
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -226,20 +227,24 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
         # Prepare text embeddings for spatial block
         # batch_size num_tokens hidden_size -> (batch_size * num_frame) num_tokens hidden_size
         encoder_hidden_states = self.caption_projection(encoder_hidden_states)  # 3 120 1152
-        encoder_hidden_states_spatial = encoder_hidden_states.repeat_interleave(num_frame, dim=0).view(
-            -1, encoder_hidden_states.shape[-2], encoder_hidden_states.shape[-1]
-        )
+        encoder_hidden_states_spatial = encoder_hidden_states.repeat_interleave(
+            num_frame, dim=0, output_size=encoder_hidden_states.shape[0] * num_frame
+        ).view(-1, encoder_hidden_states.shape[-2], encoder_hidden_states.shape[-1])

         # Prepare timesteps for spatial and temporal block
-        timestep_spatial = timestep.repeat_interleave(num_frame, dim=0).view(-1, timestep.shape[-1])
-        timestep_temp = timestep.repeat_interleave(num_patches, dim=0).view(-1, timestep.shape[-1])
+        timestep_spatial = timestep.repeat_interleave(
+            num_frame, dim=0, output_size=timestep.shape[0] * num_frame
+        ).view(-1, timestep.shape[-1])
+        timestep_temp = timestep.repeat_interleave(
+            num_patches, dim=0, output_size=timestep.shape[0] * num_patches
+        ).view(-1, timestep.shape[-1])

         # Spatial and temporal transformer blocks
         for i, (spatial_block, temp_block) in enumerate(
             zip(self.transformer_blocks, self.temporal_transformer_blocks)
         ):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = torch.utils.checkpoint.checkpoint(
+                hidden_states = self._gradient_checkpointing_func(
                     spatial_block,
                     hidden_states,
                     None,  # attention_mask
@@ -248,7 +253,6 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
                     timestep_spatial,
                     None,  # cross_attention_kwargs
                     None,  # class_labels
-                    use_reentrant=False,
                 )
             else:
                 hidden_states = spatial_block(
@@ -269,10 +273,10 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
             hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])

             if i == 0 and num_frame > 1:
-                hidden_states = hidden_states + self.temp_pos_embed
+                hidden_states = hidden_states + self.temp_pos_embed.to(hidden_states.dtype)

             if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = torch.utils.checkpoint.checkpoint(
+                hidden_states = self._gradient_checkpointing_func(
                     temp_block,
                     hidden_states,
                     None,  # attention_mask
@@ -281,7 +285,6 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
                     timestep_temp,
                     None,  # cross_attention_kwargs
                     None,  # class_labels
-                    use_reentrant=False,
                 )
             else:
                 hidden_states = temp_block(
@@ -300,7 +303,9 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin):
         ).permute(0, 2, 1, 3)
         hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])

-        embedded_timestep = embedded_timestep.repeat_interleave(num_frame, dim=0).view(-1, embedded_timestep.shape[-1])
+        embedded_timestep = embedded_timestep.repeat_interleave(
+            num_frame, dim=0, output_size=embedded_timestep.shape[0] * num_frame
+        ).view(-1, embedded_timestep.shape[-1])
         shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
         hidden_states = self.norm_out(hidden_states)
         # Modulation
--- a/diffusers/models/transformers/lumina_nextdit2d.py
+++ b/diffusers/models/transformers/lumina_nextdit2d.py
@@ -98,7 +98,7 @@ class LuminaNextDiTBlock(nn.Module):

         self.feed_forward = LuminaFeedForward(
             dim=dim,
-            inner_dim=4 * dim,
+            inner_dim=int(4 * 2 * dim / 3),
             multiple_of=multiple_of,
             ffn_dim_multiplier=ffn_dim_multiplier,
         )
@@ -221,6 +221,8 @@ class LuminaNextDiT2DModel(ModelMixin, ConfigMixin):
            overall scale of the model's operations.
    """

+    _skip_layerwise_casting_patterns = ["patch_embedder", "norm", "ffn_norm"]
+
     @register_to_config
     def __init__(
         self,
--- a/diffusers/models/transformers/pixart_transformer_2d.py
+++ b/diffusers/models/transformers/pixart_transformer_2d.py
@@ -17,7 +17,7 @@ import torch
 from torch import nn

 from ...configuration_utils import ConfigMixin, register_to_config
-from ...utils import is_torch_version, logging
+from ...utils import logging
 from ..attention import BasicTransformerBlock
 from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
 from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
@@ -79,6 +79,7 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):

     _supports_gradient_checkpointing = True
     _no_split_modules = ["BasicTransformerBlock", "PatchEmbed"]
+    _skip_layerwise_casting_patterns = ["pos_embed", "norm", "adaln_single"]

     @register_to_config
     def __init__(
@@ -183,10 +184,6 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
             in_features=self.config.caption_channels, hidden_size=self.inner_dim
         )

-    def _set_gradient_checkpointing(self, module, value=False):
-        if hasattr(module, "gradient_checkpointing"):
-            module.gradient_checkpointing = value
-
     @property
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
     def attn_processors(self) -> Dict[str, AttentionProcessor]:
@@ -387,19 +384,8 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
         # 2. Blocks
         for block in self.transformer_blocks:
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-                def create_custom_forward(module, return_dict=None):
-                    def custom_forward(*inputs):
-                        if return_dict is not None:
-                            return module(*inputs, return_dict=return_dict)
-                        else:
-                            return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
                     hidden_states,
                     attention_mask,
                     encoder_hidden_states,
@@ -407,7 +393,6 @@ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
                     timestep,
                     cross_attention_kwargs,
                     None,
-                    **ckpt_kwargs,
                 )
             else:
                 hidden_states = block(
--- a/diffusers/models/transformers/prior_transformer.py
+++ b/diffusers/models/transformers/prior_transformer.py
@@ -353,7 +353,11 @@ class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
             attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
             attention_mask = F.pad(attention_mask, (0, self.additional_embeddings), value=0.0)
             attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype)
-            attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0)
+            attention_mask = attention_mask.repeat_interleave(
+                self.config.num_attention_heads,
+                dim=0,
+                output_size=attention_mask.shape[0] * self.config.num_attention_heads,
+            )

         if self.norm_in is not None:
             hidden_states = self.norm_in(hidden_states)
--- a/diffusers/models/transformers/sana_transformer.py
+++ b/diffusers/models/transformers/sana_transformer.py
@@ -15,18 +15,18 @@
 from typing import Any, Dict, Optional, Tuple, Union

 import torch
+import torch.nn.functional as F
 from torch import nn

 from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import PeftAdapterMixin
-from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ..attention_processor import (
     Attention,
     AttentionProcessor,
-    AttnProcessor2_0,
     SanaLinearAttnProcessor2_0,
 )
-from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
+from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, TimestepEmbedding, Timesteps
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
 from ..normalization import AdaLayerNormSingle, RMSNorm
@@ -82,6 +82,109 @@ class GLUMBConv(nn.Module):
         return hidden_states


+class SanaModulatedNorm(nn.Module):
+    def __init__(self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim, elementwise_affine=elementwise_affine, eps=eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, temb: torch.Tensor, scale_shift_table: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states = self.norm(hidden_states)
+        shift, scale = (scale_shift_table[None] + temb[:, None].to(scale_shift_table.device)).chunk(2, dim=1)
+        hidden_states = hidden_states * (1 + scale) + shift
+        return hidden_states
+
+
+class SanaCombinedTimestepGuidanceEmbeddings(nn.Module):
+    def __init__(self, embedding_dim):
+        super().__init__()
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.guidance_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+
+    def forward(self, timestep: torch.Tensor, guidance: torch.Tensor = None, hidden_dtype: torch.dtype = None):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
+
+        guidance_proj = self.guidance_condition_proj(guidance)
+        guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=hidden_dtype))
+        conditioning = timesteps_emb + guidance_emb
+
+        return self.linear(self.silu(conditioning)), conditioning
+
+
+class SanaAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("SanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
 class SanaTransformerBlock(nn.Module):
     r"""
     Transformer block introduced in [Sana](https://huggingface.co/papers/2410.10629).
@@ -101,6 +204,7 @@ class SanaTransformerBlock(nn.Module):
         norm_eps: float = 1e-6,
         attention_out_bias: bool = True,
         mlp_ratio: float = 2.5,
+        qk_norm: Optional[str] = None,
     ) -> None:
         super().__init__()

@@ -110,6 +214,8 @@ class SanaTransformerBlock(nn.Module):
             query_dim=dim,
             heads=num_attention_heads,
             dim_head=attention_head_dim,
+            kv_heads=num_attention_heads if qk_norm is not None else None,
+            qk_norm=qk_norm,
             dropout=dropout,
             bias=attention_bias,
             cross_attention_dim=None,
@@ -121,13 +227,15 @@ class SanaTransformerBlock(nn.Module):
         self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
         self.attn2 = Attention(
             query_dim=dim,
+            qk_norm=qk_norm,
+            kv_heads=num_cross_attention_heads if qk_norm is not None else None,
             cross_attention_dim=cross_attention_dim,
             heads=num_cross_attention_heads,
             dim_head=cross_attention_head_dim,
             dropout=dropout,
             bias=True,
             out_bias=attention_out_bias,
-            processor=AttnProcessor2_0(),
+            processor=SanaAttnProcessor2_0(),
         )

         # 3. Feed-forward
@@ -181,7 +289,7 @@ class SanaTransformerBlock(nn.Module):
         return hidden_states


-class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     A 2D Transformer model introduced in [Sana](https://huggingface.co/papers/2410.10629) family of models.

@@ -218,10 +326,15 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
            Whether to use elementwise affinity in the normalization layer.
        norm_eps (`float`, defaults to `1e-6`):
            The epsilon value for the normalization layer.
+        qk_norm (`str`, *optional*, defaults to `None`):
+            The normalization to use for the query and key.
+        timestep_scale (`float`, defaults to `1.0`):
+            The scale to use for the timesteps.
    """

     _supports_gradient_checkpointing = True
-    _no_split_modules = ["SanaTransformerBlock", "PatchEmbed"]
+    _no_split_modules = ["SanaTransformerBlock", "PatchEmbed", "SanaModulatedNorm"]
+    _skip_layerwise_casting_patterns = ["patch_embed", "norm"]

     @register_to_config
     def __init__(
@@ -243,6 +356,10 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         norm_elementwise_affine: bool = False,
         norm_eps: float = 1e-6,
         interpolation_scale: Optional[int] = None,
+        guidance_embeds: bool = False,
+        guidance_embeds_scale: float = 0.1,
+        qk_norm: Optional[str] = None,
+        timestep_scale: float = 1.0,
     ) -> None:
         super().__init__()

@@ -250,7 +367,6 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         inner_dim = num_attention_heads * attention_head_dim

         # 1. Patch Embedding
-        interpolation_scale = interpolation_scale if interpolation_scale is not None else max(sample_size // 64, 1)
         self.patch_embed = PatchEmbed(
             height=sample_size,
             width=sample_size,
@@ -258,10 +374,14 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             in_channels=in_channels,
             embed_dim=inner_dim,
             interpolation_scale=interpolation_scale,
+            pos_embed_type="sincos" if interpolation_scale is not None else None,
         )

         # 2. Additional condition embeddings
-        self.time_embed = AdaLayerNormSingle(inner_dim)
+        if guidance_embeds:
+            self.time_embed = SanaCombinedTimestepGuidanceEmbeddings(inner_dim)
+        else:
+            self.time_embed = AdaLayerNormSingle(inner_dim)

         self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
         self.caption_norm = RMSNorm(inner_dim, eps=1e-5, elementwise_affine=True)
@@ -281,6 +401,7 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
                     norm_elementwise_affine=norm_elementwise_affine,
                     norm_eps=norm_eps,
                     mlp_ratio=mlp_ratio,
+                    qk_norm=qk_norm,
                 )
                 for _ in range(num_layers)
             ]
@@ -288,16 +409,11 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):

         # 4. Output blocks
         self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
-
-        self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+        self.norm_out = SanaModulatedNorm(inner_dim, elementwise_affine=False, eps=1e-6)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)

         self.gradient_checkpointing = False

-    def _set_gradient_checkpointing(self, module, value=False):
-        if hasattr(module, "gradient_checkpointing"):
-            module.gradient_checkpointing = value
-
     @property
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
     def attn_processors(self) -> Dict[str, AttentionProcessor]:
@@ -362,7 +478,8 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         self,
         hidden_states: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
-        timestep: torch.LongTensor,
+        timestep: torch.Tensor,
+        guidance: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -413,9 +530,14 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):

         hidden_states = self.patch_embed(hidden_states)

-        timestep, embedded_timestep = self.time_embed(
-            timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
-        )
+        if guidance is not None:
+            timestep, embedded_timestep = self.time_embed(
+                timestep, guidance=guidance, hidden_dtype=hidden_states.dtype
+            )
+        else:
+            timestep, embedded_timestep = self.time_embed(
+                timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+            )

         encoder_hidden_states = self.caption_projection(encoder_hidden_states)
         encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
@@ -424,21 +546,9 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):

         # 2. Transformer blocks
         if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-            def create_custom_forward(module, return_dict=None):
-                def custom_forward(*inputs):
-                    if return_dict is not None:
-                        return module(*inputs, return_dict=return_dict)
-                    else:
-                        return module(*inputs)
-
-                return custom_forward
-
-            ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-
             for block in self.transformer_blocks:
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
                     hidden_states,
                     attention_mask,
                     encoder_hidden_states,
@@ -446,7 +556,6 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
                     timestep,
                     post_patch_height,
                     post_patch_width,
-                    **ckpt_kwargs,
                 )

         else:
@@ -462,13 +571,8 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             )

         # 3. Normalization
-        shift, scale = (
-            self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
-        ).chunk(2, dim=1)
-        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.norm_out(hidden_states, embedded_timestep, self.scale_shift_table)

-        # 4. Modulation
-        hidden_states = hidden_states * (1 + scale) + shift
         hidden_states = self.proj_out(hidden_states)

         # 5. Unpatchify
--- a/diffusers/models/transformers/stable_audio_transformer.py
+++ b/diffusers/models/transformers/stable_audio_transformer.py
@@ -13,7 +13,7 @@
 # limitations under the License.


-from typing import Any, Dict, Optional, Union
+from typing import Dict, Optional, Union

 import numpy as np
 import torch
@@ -29,7 +29,7 @@ from ...models.attention_processor import (
 )
 from ...models.modeling_utils import ModelMixin
 from ...models.transformers.transformer_2d import Transformer2DModelOutput
-from ...utils import is_torch_version, logging
+from ...utils import logging
 from ...utils.torch_utils import maybe_allow_in_graph


@@ -211,6 +211,7 @@ class StableAudioDiTModel(ModelMixin, ConfigMixin):
    """

     _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["preprocess_conv", "postprocess_conv", "^proj_in$", "^proj_out$", "norm"]

     @register_to_config
     def __init__(
@@ -345,10 +346,6 @@ class StableAudioDiTModel(ModelMixin, ConfigMixin):
        """
        self.set_attn_processor(StableAudioAttnProcessor2_0())

-    def _set_gradient_checkpointing(self, module, value=False):
-        if hasattr(module, "gradient_checkpointing"):
-            module.gradient_checkpointing = value
-
    def forward(
        self,
        hidden_states: torch.FloatTensor,
@@ -415,25 +412,13 @@ class StableAudioDiTModel(ModelMixin, ConfigMixin):

        for block in self.transformer_blocks:
            if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-                def create_custom_forward(module, return_dict=None):
-                    def custom_forward(*inputs):
-                        if return_dict is not None:
-                            return module(*inputs, return_dict=return_dict)
-                        else:
-                            return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
                     hidden_states,
                     attention_mask,
                     cross_attention_hidden_states,
                     encoder_attention_mask,
                     rotary_embedding,
-                    **ckpt_kwargs,
                 )

             else: