diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (389)
  1. diffusers/__init__.py +186 -3
  2. diffusers/configuration_utils.py +40 -12
  3. diffusers/dependency_versions_table.py +9 -2
  4. diffusers/hooks/__init__.py +9 -0
  5. diffusers/hooks/faster_cache.py +653 -0
  6. diffusers/hooks/group_offloading.py +793 -0
  7. diffusers/hooks/hooks.py +236 -0
  8. diffusers/hooks/layerwise_casting.py +245 -0
  9. diffusers/hooks/pyramid_attention_broadcast.py +311 -0
  10. diffusers/loaders/__init__.py +6 -0
  11. diffusers/loaders/ip_adapter.py +38 -30
  12. diffusers/loaders/lora_base.py +121 -86
  13. diffusers/loaders/lora_conversion_utils.py +504 -44
  14. diffusers/loaders/lora_pipeline.py +1769 -181
  15. diffusers/loaders/peft.py +167 -57
  16. diffusers/loaders/single_file.py +17 -2
  17. diffusers/loaders/single_file_model.py +53 -5
  18. diffusers/loaders/single_file_utils.py +646 -72
  19. diffusers/loaders/textual_inversion.py +9 -9
  20. diffusers/loaders/transformer_flux.py +8 -9
  21. diffusers/loaders/transformer_sd3.py +120 -39
  22. diffusers/loaders/unet.py +20 -7
  23. diffusers/models/__init__.py +22 -0
  24. diffusers/models/activations.py +9 -9
  25. diffusers/models/attention.py +0 -1
  26. diffusers/models/attention_processor.py +163 -25
  27. diffusers/models/auto_model.py +169 -0
  28. diffusers/models/autoencoders/__init__.py +2 -0
  29. diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
  30. diffusers/models/autoencoders/autoencoder_dc.py +106 -4
  31. diffusers/models/autoencoders/autoencoder_kl.py +0 -4
  32. diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
  33. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
  34. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
  35. diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
  36. diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
  37. diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
  38. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
  39. diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
  40. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
  41. diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
  42. diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
  43. diffusers/models/autoencoders/vae.py +31 -141
  44. diffusers/models/autoencoders/vq_model.py +3 -0
  45. diffusers/models/cache_utils.py +108 -0
  46. diffusers/models/controlnets/__init__.py +1 -0
  47. diffusers/models/controlnets/controlnet.py +3 -8
  48. diffusers/models/controlnets/controlnet_flux.py +14 -42
  49. diffusers/models/controlnets/controlnet_sd3.py +58 -34
  50. diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
  51. diffusers/models/controlnets/controlnet_union.py +27 -18
  52. diffusers/models/controlnets/controlnet_xs.py +7 -46
  53. diffusers/models/controlnets/multicontrolnet_union.py +196 -0
  54. diffusers/models/embeddings.py +18 -7
  55. diffusers/models/model_loading_utils.py +122 -80
  56. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  57. diffusers/models/modeling_flax_utils.py +1 -1
  58. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  59. diffusers/models/modeling_utils.py +617 -272
  60. diffusers/models/normalization.py +67 -14
  61. diffusers/models/resnet.py +1 -1
  62. diffusers/models/transformers/__init__.py +6 -0
  63. diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
  64. diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
  65. diffusers/models/transformers/consisid_transformer_3d.py +789 -0
  66. diffusers/models/transformers/dit_transformer_2d.py +5 -19
  67. diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
  68. diffusers/models/transformers/latte_transformer_3d.py +20 -15
  69. diffusers/models/transformers/lumina_nextdit2d.py +3 -1
  70. diffusers/models/transformers/pixart_transformer_2d.py +4 -19
  71. diffusers/models/transformers/prior_transformer.py +5 -1
  72. diffusers/models/transformers/sana_transformer.py +144 -40
  73. diffusers/models/transformers/stable_audio_transformer.py +5 -20
  74. diffusers/models/transformers/transformer_2d.py +7 -22
  75. diffusers/models/transformers/transformer_allegro.py +9 -17
  76. diffusers/models/transformers/transformer_cogview3plus.py +6 -17
  77. diffusers/models/transformers/transformer_cogview4.py +462 -0
  78. diffusers/models/transformers/transformer_easyanimate.py +527 -0
  79. diffusers/models/transformers/transformer_flux.py +68 -110
  80. diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
  81. diffusers/models/transformers/transformer_ltx.py +53 -35
  82. diffusers/models/transformers/transformer_lumina2.py +548 -0
  83. diffusers/models/transformers/transformer_mochi.py +6 -17
  84. diffusers/models/transformers/transformer_omnigen.py +469 -0
  85. diffusers/models/transformers/transformer_sd3.py +56 -86
  86. diffusers/models/transformers/transformer_temporal.py +5 -11
  87. diffusers/models/transformers/transformer_wan.py +469 -0
  88. diffusers/models/unets/unet_1d.py +3 -1
  89. diffusers/models/unets/unet_2d.py +21 -20
  90. diffusers/models/unets/unet_2d_blocks.py +19 -243
  91. diffusers/models/unets/unet_2d_condition.py +4 -6
  92. diffusers/models/unets/unet_3d_blocks.py +14 -127
  93. diffusers/models/unets/unet_3d_condition.py +8 -12
  94. diffusers/models/unets/unet_i2vgen_xl.py +5 -13
  95. diffusers/models/unets/unet_kandinsky3.py +0 -4
  96. diffusers/models/unets/unet_motion_model.py +20 -114
  97. diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
  98. diffusers/models/unets/unet_stable_cascade.py +8 -35
  99. diffusers/models/unets/uvit_2d.py +1 -4
  100. diffusers/optimization.py +2 -2
  101. diffusers/pipelines/__init__.py +57 -8
  102. diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
  103. diffusers/pipelines/amused/pipeline_amused.py +15 -2
  104. diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
  105. diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
  106. diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
  107. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
  108. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
  109. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
  110. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
  111. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
  112. diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
  113. diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
  114. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
  115. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
  116. diffusers/pipelines/auto_pipeline.py +35 -14
  117. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  118. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
  119. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
  120. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
  121. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
  122. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
  123. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
  124. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
  125. diffusers/pipelines/cogview4/__init__.py +49 -0
  126. diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
  127. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
  128. diffusers/pipelines/cogview4/pipeline_output.py +21 -0
  129. diffusers/pipelines/consisid/__init__.py +49 -0
  130. diffusers/pipelines/consisid/consisid_utils.py +357 -0
  131. diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
  132. diffusers/pipelines/consisid/pipeline_output.py +20 -0
  133. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
  134. diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
  135. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
  136. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
  137. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
  138. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
  139. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
  140. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
  141. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
  142. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
  143. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
  144. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
  145. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
  146. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
  147. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
  148. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
  149. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
  150. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
  151. diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
  152. diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
  153. diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
  154. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
  155. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
  156. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
  157. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
  158. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
  159. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
  160. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
  161. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  162. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
  163. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
  164. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
  165. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
  166. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
  167. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
  168. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
  169. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
  170. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  171. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  172. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
  173. diffusers/pipelines/dit/pipeline_dit.py +15 -2
  174. diffusers/pipelines/easyanimate/__init__.py +52 -0
  175. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
  176. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
  177. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
  178. diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
  179. diffusers/pipelines/flux/pipeline_flux.py +53 -21
  180. diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
  181. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
  182. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
  183. diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
  184. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
  185. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
  186. diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
  187. diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
  188. diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
  189. diffusers/pipelines/free_noise_utils.py +3 -3
  190. diffusers/pipelines/hunyuan_video/__init__.py +4 -0
  191. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
  192. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
  193. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
  194. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
  195. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
  196. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
  197. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  198. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
  199. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
  200. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
  201. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
  202. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
  203. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
  204. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
  205. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
  206. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
  207. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
  208. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
  209. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
  210. diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
  211. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
  212. diffusers/pipelines/kolors/text_encoder.py +7 -34
  213. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
  214. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
  215. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
  216. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
  217. diffusers/pipelines/latte/pipeline_latte.py +36 -7
  218. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
  219. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
  220. diffusers/pipelines/ltx/__init__.py +2 -0
  221. diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
  222. diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
  223. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
  224. diffusers/pipelines/lumina/__init__.py +2 -2
  225. diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
  226. diffusers/pipelines/lumina2/__init__.py +48 -0
  227. diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
  228. diffusers/pipelines/marigold/__init__.py +2 -0
  229. diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
  230. diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
  231. diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
  232. diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
  233. diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
  234. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
  235. diffusers/pipelines/omnigen/__init__.py +50 -0
  236. diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
  237. diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
  238. diffusers/pipelines/onnx_utils.py +5 -3
  239. diffusers/pipelines/pag/pag_utils.py +1 -1
  240. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
  241. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
  242. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
  243. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
  244. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
  245. diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
  246. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
  247. diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
  248. diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
  249. diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
  250. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
  251. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
  252. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
  253. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
  254. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
  255. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
  256. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
  257. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
  258. diffusers/pipelines/pia/pipeline_pia.py +13 -1
  259. diffusers/pipelines/pipeline_flax_utils.py +7 -7
  260. diffusers/pipelines/pipeline_loading_utils.py +193 -83
  261. diffusers/pipelines/pipeline_utils.py +221 -106
  262. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
  263. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
  264. diffusers/pipelines/sana/__init__.py +2 -0
  265. diffusers/pipelines/sana/pipeline_sana.py +183 -58
  266. diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
  267. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
  268. diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
  269. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
  270. diffusers/pipelines/shap_e/renderer.py +6 -6
  271. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
  272. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
  273. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
  274. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
  275. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
  276. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
  277. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
  278. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
  279. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  280. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
  281. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
  282. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
  283. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
  284. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
  285. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
  286. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
  287. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
  288. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
  289. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
  290. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
  291. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
  292. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
  293. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
  294. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
  295. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
  296. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
  297. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
  298. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
  299. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
  300. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
  301. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
  302. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
  303. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
  304. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
  305. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
  306. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  307. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
  308. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
  309. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
  310. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
  311. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
  312. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
  313. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
  314. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
  315. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
  316. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
  317. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
  318. diffusers/pipelines/transformers_loading_utils.py +121 -0
  319. diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
  320. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
  321. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
  322. diffusers/pipelines/wan/__init__.py +51 -0
  323. diffusers/pipelines/wan/pipeline_output.py +20 -0
  324. diffusers/pipelines/wan/pipeline_wan.py +593 -0
  325. diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
  326. diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
  327. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
  328. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
  329. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
  330. diffusers/quantizers/auto.py +5 -1
  331. diffusers/quantizers/base.py +5 -9
  332. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
  333. diffusers/quantizers/bitsandbytes/utils.py +30 -20
  334. diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
  335. diffusers/quantizers/gguf/utils.py +4 -2
  336. diffusers/quantizers/quantization_config.py +59 -4
  337. diffusers/quantizers/quanto/__init__.py +1 -0
  338. diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
  339. diffusers/quantizers/quanto/utils.py +60 -0
  340. diffusers/quantizers/torchao/__init__.py +1 -1
  341. diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
  342. diffusers/schedulers/__init__.py +2 -1
  343. diffusers/schedulers/scheduling_consistency_models.py +1 -2
  344. diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
  345. diffusers/schedulers/scheduling_ddpm.py +2 -3
  346. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
  347. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
  348. diffusers/schedulers/scheduling_edm_euler.py +45 -10
  349. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
  350. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
  351. diffusers/schedulers/scheduling_heun_discrete.py +1 -1
  352. diffusers/schedulers/scheduling_lcm.py +1 -2
  353. diffusers/schedulers/scheduling_lms_discrete.py +1 -1
  354. diffusers/schedulers/scheduling_repaint.py +5 -1
  355. diffusers/schedulers/scheduling_scm.py +265 -0
  356. diffusers/schedulers/scheduling_tcd.py +1 -2
  357. diffusers/schedulers/scheduling_utils.py +2 -1
  358. diffusers/training_utils.py +14 -7
  359. diffusers/utils/__init__.py +9 -1
  360. diffusers/utils/constants.py +13 -1
  361. diffusers/utils/deprecation_utils.py +1 -1
  362. diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
  363. diffusers/utils/dummy_gguf_objects.py +17 -0
  364. diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
  365. diffusers/utils/dummy_pt_objects.py +233 -0
  366. diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
  367. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  368. diffusers/utils/dummy_torchao_objects.py +17 -0
  369. diffusers/utils/dynamic_modules_utils.py +1 -1
  370. diffusers/utils/export_utils.py +28 -3
  371. diffusers/utils/hub_utils.py +52 -102
  372. diffusers/utils/import_utils.py +121 -221
  373. diffusers/utils/loading_utils.py +2 -1
  374. diffusers/utils/logging.py +1 -2
  375. diffusers/utils/peft_utils.py +6 -14
  376. diffusers/utils/remote_utils.py +425 -0
  377. diffusers/utils/source_code_parsing_utils.py +52 -0
  378. diffusers/utils/state_dict_utils.py +15 -1
  379. diffusers/utils/testing_utils.py +243 -13
  380. diffusers/utils/torch_utils.py +10 -0
  381. diffusers/utils/typing_utils.py +91 -0
  382. diffusers/video_processor.py +1 -1
  383. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
  384. diffusers-0.33.0.dist-info/RECORD +608 -0
  385. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
  386. diffusers-0.32.2.dist-info/RECORD +0 -550
  387. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
  388. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
  389. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
@@ -138,10 +138,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapter
138
138
  self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
139
139
  self.tile_overlap_factor = 0.25
140
140
 
141
- def _set_gradient_checkpointing(self, module, value=False):
142
- if isinstance(module, (Encoder, Decoder)):
143
- module.gradient_checkpointing = value
144
-
145
141
  def enable_tiling(self, use_tiling: bool = True):
146
142
  r"""
147
143
  Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
@@ -103,7 +103,7 @@ class AllegroTemporalConvLayer(nn.Module):
103
103
  if self.down_sample:
104
104
  identity = hidden_states[:, :, ::2]
105
105
  elif self.up_sample:
106
- identity = hidden_states.repeat_interleave(2, dim=2)
106
+ identity = hidden_states.repeat_interleave(2, dim=2, output_size=hidden_states.shape[2] * 2)
107
107
  else:
108
108
  identity = hidden_states
109
109
 
@@ -507,19 +507,12 @@ class AllegroEncoder3D(nn.Module):
507
507
  sample = sample + residual
508
508
 
509
509
  if torch.is_grad_enabled() and self.gradient_checkpointing:
510
-
511
- def create_custom_forward(module):
512
- def custom_forward(*inputs):
513
- return module(*inputs)
514
-
515
- return custom_forward
516
-
517
510
  # Down blocks
518
511
  for down_block in self.down_blocks:
519
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
512
+ sample = self._gradient_checkpointing_func(down_block, sample)
520
513
 
521
514
  # Mid block
522
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
515
+ sample = self._gradient_checkpointing_func(self.mid_block, sample)
523
516
  else:
524
517
  # Down blocks
525
518
  for down_block in self.down_blocks:
@@ -647,19 +640,12 @@ class AllegroDecoder3D(nn.Module):
647
640
  upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
648
641
 
649
642
  if torch.is_grad_enabled() and self.gradient_checkpointing:
650
-
651
- def create_custom_forward(module):
652
- def custom_forward(*inputs):
653
- return module(*inputs)
654
-
655
- return custom_forward
656
-
657
643
  # Mid block
658
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
644
+ sample = self._gradient_checkpointing_func(self.mid_block, sample)
659
645
 
660
646
  # Up blocks
661
647
  for up_block in self.up_blocks:
662
- sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample)
648
+ sample = self._gradient_checkpointing_func(up_block, sample)
663
649
 
664
650
  else:
665
651
  # Mid block
@@ -809,10 +795,6 @@ class AutoencoderKLAllegro(ModelMixin, ConfigMixin):
809
795
  sample_size - self.tile_overlap_w,
810
796
  )
811
797
 
812
- def _set_gradient_checkpointing(self, module, value=False):
813
- if isinstance(module, (AllegroEncoder3D, AllegroDecoder3D)):
814
- module.gradient_checkpointing = value
815
-
816
798
  def enable_tiling(self) -> None:
817
799
  r"""
818
800
  Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
@@ -105,6 +105,7 @@ class CogVideoXCausalConv3d(nn.Module):
105
105
  self.width_pad = width_pad
106
106
  self.time_pad = time_pad
107
107
  self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
108
+ self.const_padding_conv3d = (0, self.width_pad, self.height_pad)
108
109
 
109
110
  self.temporal_dim = 2
110
111
  self.time_kernel_size = time_kernel_size
@@ -117,6 +118,8 @@ class CogVideoXCausalConv3d(nn.Module):
117
118
  kernel_size=kernel_size,
118
119
  stride=stride,
119
120
  dilation=dilation,
121
+ padding=0 if self.pad_mode == "replicate" else self.const_padding_conv3d,
122
+ padding_mode="zeros",
120
123
  )
121
124
 
122
125
  def fake_context_parallel_forward(
@@ -137,9 +140,7 @@ class CogVideoXCausalConv3d(nn.Module):
137
140
  if self.pad_mode == "replicate":
138
141
  conv_cache = None
139
142
  else:
140
- padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
141
143
  conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
142
- inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
143
144
 
144
145
  output = self.conv(inputs)
145
146
  return output, conv_cache
@@ -421,15 +422,8 @@ class CogVideoXDownBlock3D(nn.Module):
421
422
  conv_cache_key = f"resnet_{i}"
422
423
 
423
424
  if torch.is_grad_enabled() and self.gradient_checkpointing:
424
-
425
- def create_custom_forward(module):
426
- def create_forward(*inputs):
427
- return module(*inputs)
428
-
429
- return create_forward
430
-
431
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
432
- create_custom_forward(resnet),
425
+ hidden_states, new_conv_cache[conv_cache_key] = self._gradient_checkpointing_func(
426
+ resnet,
433
427
  hidden_states,
434
428
  temb,
435
429
  zq,
@@ -523,15 +517,8 @@ class CogVideoXMidBlock3D(nn.Module):
523
517
  conv_cache_key = f"resnet_{i}"
524
518
 
525
519
  if torch.is_grad_enabled() and self.gradient_checkpointing:
526
-
527
- def create_custom_forward(module):
528
- def create_forward(*inputs):
529
- return module(*inputs)
530
-
531
- return create_forward
532
-
533
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
534
- create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key)
520
+ hidden_states, new_conv_cache[conv_cache_key] = self._gradient_checkpointing_func(
521
+ resnet, hidden_states, temb, zq, conv_cache.get(conv_cache_key)
535
522
  )
536
523
  else:
537
524
  hidden_states, new_conv_cache[conv_cache_key] = resnet(
@@ -637,15 +624,8 @@ class CogVideoXUpBlock3D(nn.Module):
637
624
  conv_cache_key = f"resnet_{i}"
638
625
 
639
626
  if torch.is_grad_enabled() and self.gradient_checkpointing:
640
-
641
- def create_custom_forward(module):
642
- def create_forward(*inputs):
643
- return module(*inputs)
644
-
645
- return create_forward
646
-
647
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
648
- create_custom_forward(resnet),
627
+ hidden_states, new_conv_cache[conv_cache_key] = self._gradient_checkpointing_func(
628
+ resnet,
649
629
  hidden_states,
650
630
  temb,
651
631
  zq,
@@ -774,18 +754,11 @@ class CogVideoXEncoder3D(nn.Module):
774
754
  hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
775
755
 
776
756
  if torch.is_grad_enabled() and self.gradient_checkpointing:
777
-
778
- def create_custom_forward(module):
779
- def custom_forward(*inputs):
780
- return module(*inputs)
781
-
782
- return custom_forward
783
-
784
757
  # 1. Down
785
758
  for i, down_block in enumerate(self.down_blocks):
786
759
  conv_cache_key = f"down_block_{i}"
787
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
788
- create_custom_forward(down_block),
760
+ hidden_states, new_conv_cache[conv_cache_key] = self._gradient_checkpointing_func(
761
+ down_block,
789
762
  hidden_states,
790
763
  temb,
791
764
  None,
@@ -793,8 +766,8 @@ class CogVideoXEncoder3D(nn.Module):
793
766
  )
794
767
 
795
768
  # 2. Mid
796
- hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint(
797
- create_custom_forward(self.mid_block),
769
+ hidden_states, new_conv_cache["mid_block"] = self._gradient_checkpointing_func(
770
+ self.mid_block,
798
771
  hidden_states,
799
772
  temb,
800
773
  None,
@@ -940,16 +913,9 @@ class CogVideoXDecoder3D(nn.Module):
940
913
  hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
941
914
 
942
915
  if torch.is_grad_enabled() and self.gradient_checkpointing:
943
-
944
- def create_custom_forward(module):
945
- def custom_forward(*inputs):
946
- return module(*inputs)
947
-
948
- return custom_forward
949
-
950
916
  # 1. Mid
951
- hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint(
952
- create_custom_forward(self.mid_block),
917
+ hidden_states, new_conv_cache["mid_block"] = self._gradient_checkpointing_func(
918
+ self.mid_block,
953
919
  hidden_states,
954
920
  temb,
955
921
  sample,
@@ -959,8 +925,8 @@ class CogVideoXDecoder3D(nn.Module):
959
925
  # 2. Up
960
926
  for i, up_block in enumerate(self.up_blocks):
961
927
  conv_cache_key = f"up_block_{i}"
962
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
963
- create_custom_forward(up_block),
928
+ hidden_states, new_conv_cache[conv_cache_key] = self._gradient_checkpointing_func(
929
+ up_block,
964
930
  hidden_states,
965
931
  temb,
966
932
  sample,
@@ -1122,10 +1088,6 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
1122
1088
  self.tile_overlap_factor_height = 1 / 6
1123
1089
  self.tile_overlap_factor_width = 1 / 5
1124
1090
 
1125
- def _set_gradient_checkpointing(self, module, value=False):
1126
- if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)):
1127
- module.gradient_checkpointing = value
1128
-
1129
1091
  def enable_tiling(
1130
1092
  self,
1131
1093
  tile_sample_min_height: Optional[int] = None,
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Any, Dict, Optional, Tuple, Union
15
+ from typing import Optional, Tuple, Union
16
16
 
17
17
  import numpy as np
18
18
  import torch
@@ -21,7 +21,7 @@ import torch.nn.functional as F
21
21
  import torch.utils.checkpoint
22
22
 
23
23
  from ...configuration_utils import ConfigMixin, register_to_config
24
- from ...utils import is_torch_version, logging
24
+ from ...utils import logging
25
25
  from ...utils.accelerate_utils import apply_forward_hook
26
26
  from ..activations import get_activation
27
27
  from ..attention_processor import Attention
@@ -36,11 +36,11 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
36
36
  def prepare_causal_attention_mask(
37
37
  num_frames: int, height_width: int, dtype: torch.dtype, device: torch.device, batch_size: int = None
38
38
  ) -> torch.Tensor:
39
- seq_len = num_frames * height_width
40
- mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
41
- for i in range(seq_len):
42
- i_frame = i // height_width
43
- mask[i, : (i_frame + 1) * height_width] = 0
39
+ indices = torch.arange(1, num_frames + 1, dtype=torch.int32, device=device)
40
+ indices_blocks = indices.repeat_interleave(height_width)
41
+ x, y = torch.meshgrid(indices_blocks, indices_blocks, indexing="xy")
42
+ mask = torch.where(x <= y, 0, -float("inf")).to(dtype=dtype)
43
+
44
44
  if batch_size is not None:
45
45
  mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
46
46
  return mask
@@ -252,21 +252,7 @@ class HunyuanVideoMidBlock3D(nn.Module):
252
252
 
253
253
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
254
254
  if torch.is_grad_enabled() and self.gradient_checkpointing:
255
-
256
- def create_custom_forward(module, return_dict=None):
257
- def custom_forward(*inputs):
258
- if return_dict is not None:
259
- return module(*inputs, return_dict=return_dict)
260
- else:
261
- return module(*inputs)
262
-
263
- return custom_forward
264
-
265
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
266
-
267
- hidden_states = torch.utils.checkpoint.checkpoint(
268
- create_custom_forward(self.resnets[0]), hidden_states, **ckpt_kwargs
269
- )
255
+ hidden_states = self._gradient_checkpointing_func(self.resnets[0], hidden_states)
270
256
 
271
257
  for attn, resnet in zip(self.attentions, self.resnets[1:]):
272
258
  if attn is not None:
@@ -278,9 +264,7 @@ class HunyuanVideoMidBlock3D(nn.Module):
278
264
  hidden_states = attn(hidden_states, attention_mask=attention_mask)
279
265
  hidden_states = hidden_states.unflatten(1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
280
266
 
281
- hidden_states = torch.utils.checkpoint.checkpoint(
282
- create_custom_forward(resnet), hidden_states, **ckpt_kwargs
283
- )
267
+ hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
284
268
 
285
269
  else:
286
270
  hidden_states = self.resnets[0](hidden_states)
@@ -350,22 +334,8 @@ class HunyuanVideoDownBlock3D(nn.Module):
350
334
 
351
335
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
352
336
  if torch.is_grad_enabled() and self.gradient_checkpointing:
353
-
354
- def create_custom_forward(module, return_dict=None):
355
- def custom_forward(*inputs):
356
- if return_dict is not None:
357
- return module(*inputs, return_dict=return_dict)
358
- else:
359
- return module(*inputs)
360
-
361
- return custom_forward
362
-
363
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
364
-
365
337
  for resnet in self.resnets:
366
- hidden_states = torch.utils.checkpoint.checkpoint(
367
- create_custom_forward(resnet), hidden_states, **ckpt_kwargs
368
- )
338
+ hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
369
339
  else:
370
340
  for resnet in self.resnets:
371
341
  hidden_states = resnet(hidden_states)
@@ -426,22 +396,8 @@ class HunyuanVideoUpBlock3D(nn.Module):
426
396
 
427
397
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
428
398
  if torch.is_grad_enabled() and self.gradient_checkpointing:
429
-
430
- def create_custom_forward(module, return_dict=None):
431
- def custom_forward(*inputs):
432
- if return_dict is not None:
433
- return module(*inputs, return_dict=return_dict)
434
- else:
435
- return module(*inputs)
436
-
437
- return custom_forward
438
-
439
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
440
-
441
399
  for resnet in self.resnets:
442
- hidden_states = torch.utils.checkpoint.checkpoint(
443
- create_custom_forward(resnet), hidden_states, **ckpt_kwargs
444
- )
400
+ hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
445
401
 
446
402
  else:
447
403
  for resnet in self.resnets:
@@ -545,26 +501,10 @@ class HunyuanVideoEncoder3D(nn.Module):
545
501
  hidden_states = self.conv_in(hidden_states)
546
502
 
547
503
  if torch.is_grad_enabled() and self.gradient_checkpointing:
548
-
549
- def create_custom_forward(module, return_dict=None):
550
- def custom_forward(*inputs):
551
- if return_dict is not None:
552
- return module(*inputs, return_dict=return_dict)
553
- else:
554
- return module(*inputs)
555
-
556
- return custom_forward
557
-
558
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
559
-
560
504
  for down_block in self.down_blocks:
561
- hidden_states = torch.utils.checkpoint.checkpoint(
562
- create_custom_forward(down_block), hidden_states, **ckpt_kwargs
563
- )
505
+ hidden_states = self._gradient_checkpointing_func(down_block, hidden_states)
564
506
 
565
- hidden_states = torch.utils.checkpoint.checkpoint(
566
- create_custom_forward(self.mid_block), hidden_states, **ckpt_kwargs
567
- )
507
+ hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
568
508
  else:
569
509
  for down_block in self.down_blocks:
570
510
  hidden_states = down_block(hidden_states)
@@ -667,26 +607,10 @@ class HunyuanVideoDecoder3D(nn.Module):
667
607
  hidden_states = self.conv_in(hidden_states)
668
608
 
669
609
  if torch.is_grad_enabled() and self.gradient_checkpointing:
670
-
671
- def create_custom_forward(module, return_dict=None):
672
- def custom_forward(*inputs):
673
- if return_dict is not None:
674
- return module(*inputs, return_dict=return_dict)
675
- else:
676
- return module(*inputs)
677
-
678
- return custom_forward
679
-
680
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
681
-
682
- hidden_states = torch.utils.checkpoint.checkpoint(
683
- create_custom_forward(self.mid_block), hidden_states, **ckpt_kwargs
684
- )
610
+ hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
685
611
 
686
612
  for up_block in self.up_blocks:
687
- hidden_states = torch.utils.checkpoint.checkpoint(
688
- create_custom_forward(up_block), hidden_states, **ckpt_kwargs
689
- )
613
+ hidden_states = self._gradient_checkpointing_func(up_block, hidden_states)
690
614
  else:
691
615
  hidden_states = self.mid_block(hidden_states)
692
616
 
@@ -786,7 +710,7 @@ class AutoencoderKLHunyuanVideo(ModelMixin, ConfigMixin):
786
710
  self.use_tiling = False
787
711
 
788
712
  # When decoding temporally long video latents, the memory requirement is very high. By decoding latent frames
789
- # at a fixed frame batch size (based on `self.num_latent_frames_batch_sizes`), the memory requirement can be lowered.
713
+ # at a fixed frame batch size (based on `self.tile_sample_min_num_frames`), the memory requirement can be lowered.
790
714
  self.use_framewise_encoding = True
791
715
  self.use_framewise_decoding = True
792
716
 
@@ -800,10 +724,6 @@ class AutoencoderKLHunyuanVideo(ModelMixin, ConfigMixin):
800
724
  self.tile_sample_stride_width = 192
801
725
  self.tile_sample_stride_num_frames = 12
802
726
 
803
- def _set_gradient_checkpointing(self, module, value=False):
804
- if isinstance(module, (HunyuanVideoEncoder3D, HunyuanVideoDecoder3D)):
805
- module.gradient_checkpointing = value
806
-
807
727
  def enable_tiling(
808
728
  self,
809
729
  tile_sample_min_height: Optional[int] = None,
@@ -868,7 +788,7 @@ class AutoencoderKLHunyuanVideo(ModelMixin, ConfigMixin):
868
788
  def _encode(self, x: torch.Tensor) -> torch.Tensor:
869
789
  batch_size, num_channels, num_frames, height, width = x.shape
870
790
 
871
- if self.use_framewise_decoding and num_frames > self.tile_sample_min_num_frames:
791
+ if self.use_framewise_encoding and num_frames > self.tile_sample_min_num_frames:
872
792
  return self._temporal_tiled_encode(x)
873
793
 
874
794
  if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):