diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. diffusers/__init__.py +97 -4
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +13 -1
  4. diffusers/image_processor.py +282 -71
  5. diffusers/loaders/__init__.py +24 -3
  6. diffusers/loaders/ip_adapter.py +543 -16
  7. diffusers/loaders/lora_base.py +138 -125
  8. diffusers/loaders/lora_conversion_utils.py +647 -0
  9. diffusers/loaders/lora_pipeline.py +2216 -230
  10. diffusers/loaders/peft.py +380 -0
  11. diffusers/loaders/single_file_model.py +71 -4
  12. diffusers/loaders/single_file_utils.py +597 -10
  13. diffusers/loaders/textual_inversion.py +5 -3
  14. diffusers/loaders/transformer_flux.py +181 -0
  15. diffusers/loaders/transformer_sd3.py +89 -0
  16. diffusers/loaders/unet.py +56 -12
  17. diffusers/models/__init__.py +49 -12
  18. diffusers/models/activations.py +22 -9
  19. diffusers/models/adapter.py +53 -53
  20. diffusers/models/attention.py +98 -13
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2160 -346
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +73 -12
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +70 -0
  36. diffusers/models/controlnet_sd3.py +26 -376
  37. diffusers/models/controlnet_sparsectrl.py +46 -719
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +996 -92
  49. diffusers/models/embeddings_flax.py +23 -9
  50. diffusers/models/model_loading_utils.py +264 -14
  51. diffusers/models/modeling_flax_utils.py +1 -1
  52. diffusers/models/modeling_utils.py +334 -51
  53. diffusers/models/normalization.py +157 -13
  54. diffusers/models/transformers/__init__.py +6 -0
  55. diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
  56. diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
  57. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  58. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  59. diffusers/models/transformers/pixart_transformer_2d.py +10 -2
  60. diffusers/models/transformers/sana_transformer.py +488 -0
  61. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  62. diffusers/models/transformers/transformer_2d.py +1 -1
  63. diffusers/models/transformers/transformer_allegro.py +422 -0
  64. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  65. diffusers/models/transformers/transformer_flux.py +189 -51
  66. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  67. diffusers/models/transformers/transformer_ltx.py +469 -0
  68. diffusers/models/transformers/transformer_mochi.py +499 -0
  69. diffusers/models/transformers/transformer_sd3.py +112 -18
  70. diffusers/models/transformers/transformer_temporal.py +1 -1
  71. diffusers/models/unets/unet_1d_blocks.py +1 -1
  72. diffusers/models/unets/unet_2d.py +8 -1
  73. diffusers/models/unets/unet_2d_blocks.py +88 -21
  74. diffusers/models/unets/unet_2d_condition.py +9 -9
  75. diffusers/models/unets/unet_3d_blocks.py +9 -7
  76. diffusers/models/unets/unet_motion_model.py +46 -68
  77. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  78. diffusers/models/unets/unet_stable_cascade.py +2 -2
  79. diffusers/models/unets/uvit_2d.py +1 -1
  80. diffusers/models/upsampling.py +14 -6
  81. diffusers/pipelines/__init__.py +69 -6
  82. diffusers/pipelines/allegro/__init__.py +48 -0
  83. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  84. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  85. diffusers/pipelines/animatediff/__init__.py +2 -0
  86. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  87. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
  88. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  89. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
  90. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
  91. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  92. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  93. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
  94. diffusers/pipelines/auto_pipeline.py +88 -10
  95. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  96. diffusers/pipelines/cogvideo/__init__.py +2 -0
  97. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
  98. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  99. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
  100. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
  101. diffusers/pipelines/cogview3/__init__.py +47 -0
  102. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  103. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  104. diffusers/pipelines/controlnet/__init__.py +86 -80
  105. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  106. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
  107. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
  108. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
  109. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
  110. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
  111. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
  112. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  113. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  114. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  115. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
  116. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  117. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
  118. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  119. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  120. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  121. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  122. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  123. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
  124. diffusers/pipelines/flux/__init__.py +23 -1
  125. diffusers/pipelines/flux/modeling_flux.py +47 -0
  126. diffusers/pipelines/flux/pipeline_flux.py +256 -48
  127. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  128. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  129. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  130. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  131. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  132. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  133. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  134. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  135. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  136. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  137. diffusers/pipelines/flux/pipeline_output.py +16 -0
  138. diffusers/pipelines/free_noise_utils.py +365 -5
  139. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  140. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  141. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  142. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
  143. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  145. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  146. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  147. diffusers/pipelines/kolors/text_encoder.py +2 -2
  148. diffusers/pipelines/kolors/tokenizer.py +4 -0
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  155. diffusers/pipelines/ltx/__init__.py +50 -0
  156. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  157. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  158. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  159. diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
  160. diffusers/pipelines/mochi/__init__.py +48 -0
  161. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  162. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  163. diffusers/pipelines/pag/__init__.py +13 -0
  164. diffusers/pipelines/pag/pag_utils.py +8 -2
  165. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
  166. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  167. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
  168. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  169. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
  170. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  171. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
  172. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  173. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  174. diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
  175. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  176. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  177. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  178. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  179. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  180. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  181. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  182. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  183. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  184. diffusers/pipelines/pipeline_loading_utils.py +250 -31
  185. diffusers/pipelines/pipeline_utils.py +158 -186
  186. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
  187. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
  188. diffusers/pipelines/sana/__init__.py +47 -0
  189. diffusers/pipelines/sana/pipeline_output.py +21 -0
  190. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  191. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  192. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  193. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  194. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
  195. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  196. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  197. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  198. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
  199. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
  200. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
  201. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  202. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  203. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  204. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  205. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
  206. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
  208. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  209. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  210. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  211. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  212. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  213. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  214. diffusers/quantizers/__init__.py +16 -0
  215. diffusers/quantizers/auto.py +139 -0
  216. diffusers/quantizers/base.py +233 -0
  217. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  218. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  219. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  220. diffusers/quantizers/gguf/__init__.py +1 -0
  221. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  222. diffusers/quantizers/gguf/utils.py +456 -0
  223. diffusers/quantizers/quantization_config.py +669 -0
  224. diffusers/quantizers/torchao/__init__.py +15 -0
  225. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  226. diffusers/schedulers/scheduling_ddim.py +4 -1
  227. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  228. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  229. diffusers/schedulers/scheduling_ddpm.py +6 -7
  230. diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
  231. diffusers/schedulers/scheduling_deis_multistep.py +102 -6
  232. diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
  236. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  237. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  238. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  239. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  240. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  241. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  244. diffusers/schedulers/scheduling_lcm.py +2 -6
  245. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  246. diffusers/schedulers/scheduling_repaint.py +1 -1
  247. diffusers/schedulers/scheduling_sasolver.py +102 -6
  248. diffusers/schedulers/scheduling_tcd.py +2 -6
  249. diffusers/schedulers/scheduling_unclip.py +4 -1
  250. diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
  251. diffusers/training_utils.py +63 -19
  252. diffusers/utils/__init__.py +7 -1
  253. diffusers/utils/constants.py +1 -0
  254. diffusers/utils/dummy_pt_objects.py +240 -0
  255. diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
  256. diffusers/utils/dynamic_modules_utils.py +3 -3
  257. diffusers/utils/hub_utils.py +44 -40
  258. diffusers/utils/import_utils.py +98 -8
  259. diffusers/utils/loading_utils.py +28 -4
  260. diffusers/utils/peft_utils.py +6 -3
  261. diffusers/utils/testing_utils.py +115 -1
  262. diffusers/utils/torch_utils.py +3 -0
  263. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
  264. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
  265. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  266. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  267. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  268. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import Optional, Tuple, Union
+ from typing import Dict, Optional, Tuple, Union

  import numpy as np
  import torch
@@ -41,7 +41,9 @@ class CogVideoXSafeConv3d(nn.Conv3d):
      """

      def forward(self, input: torch.Tensor) -> torch.Tensor:
-         memory_count = torch.prod(torch.tensor(input.shape)).item() * 2 / 1024**3
+         memory_count = (
+             (input.shape[0] * input.shape[1] * input.shape[2] * input.shape[3] * input.shape[4]) * 2 / 1024**3
+         )

          # Set to 2GB, suitable for CuDNN
          if memory_count > 2:
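The rewrite above replaces `torch.prod(torch.tensor(input.shape)).item()` with a plain product of Python ints: the same estimate, without materializing a tensor just to multiply five scalars. A quick sanity check that the two expressions agree (the shape below is made up; the constant 2 assumes 2-byte fp16/bf16 elements):

```python
import torch

# Illustrative (batch, channels, frames, height, width); not a shape from the diff.
shape = (1, 16, 13, 60, 90)
x = torch.empty(shape, dtype=torch.float16)

old_gib = torch.prod(torch.tensor(x.shape)).item() * 2 / 1024**3
new_gib = (shape[0] * shape[1] * shape[2] * shape[3] * shape[4]) * 2 / 1024**3
assert old_gib == new_gib  # identical estimate, cheaper to compute
print(f"{new_gib:.4f} GiB")  # the chunked fallback in forward() kicks in above 2 GiB
```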
@@ -92,11 +94,13 @@ class CogVideoXCausalConv3d(nn.Module):

          time_kernel_size, height_kernel_size, width_kernel_size = kernel_size

-         self.pad_mode = pad_mode
-         time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
-         height_pad = height_kernel_size // 2
-         width_pad = width_kernel_size // 2
+         # TODO(aryan): configure calculation based on stride and dilation in the future.
+         # Since CogVideoX does not use it, it is currently tailored to "just work" with Mochi
+         time_pad = time_kernel_size - 1
+         height_pad = (height_kernel_size - 1) // 2
+         width_pad = (width_kernel_size - 1) // 2

+         self.pad_mode = pad_mode
          self.height_pad = height_pad
          self.width_pad = width_pad
          self.time_pad = time_pad
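Under the new scheme the temporal axis is strictly causal: all `time_kernel_size - 1` frames of padding sit on the past side, while the spatial pads stay centered. A worked example, with an illustrative `kernel_size = (3, 3, 3)` that is not taken from the diff:

```python
# Names mirror the diff; the kernel size is illustrative.
time_kernel_size, height_kernel_size, width_kernel_size = 3, 3, 3

time_pad = time_kernel_size - 1             # 2 frames of history, all on the past side
height_pad = (height_kernel_size - 1) // 2  # 1, centered vertically
width_pad = (width_kernel_size - 1) // 2    # 1, centered horizontally

# With stride 1, left-padding by (kernel - 1) preserves the frame count:
# T_out = T + time_pad - (time_kernel_size - 1) = T, so frame t never sees frames > t.
T = 13
assert T + time_pad - (time_kernel_size - 1) == T
```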
@@ -105,7 +109,7 @@ class CogVideoXCausalConv3d(nn.Module):
          self.temporal_dim = 2
          self.time_kernel_size = time_kernel_size

-         stride = (stride, 1, 1)
+         stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
          dilation = (dilation, 1, 1)
          self.conv = CogVideoXSafeConv3d(
              in_channels=in_channels,
@@ -115,34 +119,30 @@ class CogVideoXCausalConv3d(nn.Module):
              dilation=dilation,
          )

-         self.conv_cache = None
-
-     def fake_context_parallel_forward(self, inputs: torch.Tensor) -> torch.Tensor:
-         kernel_size = self.time_kernel_size
-         if kernel_size > 1:
-             cached_inputs = (
-                 [self.conv_cache] if self.conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1)
-             )
-             inputs = torch.cat(cached_inputs + [inputs], dim=2)
+     def fake_context_parallel_forward(
+         self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None
+     ) -> torch.Tensor:
+         if self.pad_mode == "replicate":
+             inputs = F.pad(inputs, self.time_causal_padding, mode="replicate")
+         else:
+             kernel_size = self.time_kernel_size
+             if kernel_size > 1:
+                 cached_inputs = [conv_cache] if conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1)
+                 inputs = torch.cat(cached_inputs + [inputs], dim=2)
          return inputs

-     def _clear_fake_context_parallel_cache(self):
-         del self.conv_cache
-         self.conv_cache = None
+     def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None) -> torch.Tensor:
+         inputs = self.fake_context_parallel_forward(inputs, conv_cache)

-     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-         inputs = self.fake_context_parallel_forward(inputs)
-
-         self._clear_fake_context_parallel_cache()
-         # Note: we could move these to the cpu for a lower maximum memory usage but its only a few
-         # hundred megabytes and so let's not do it for now
-         self.conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
-
-         padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
-         inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
+         if self.pad_mode == "replicate":
+             conv_cache = None
+         else:
+             padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
+             conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
+             inputs = F.pad(inputs, padding_2d, mode="constant", value=0)

          output = self.conv(inputs)
-         return output
+         return output, conv_cache


  class CogVideoXSpatialNorm3D(nn.Module):
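This hunk is the heart of the refactor: the mutable `self.conv_cache` attribute is gone, and the trailing `time_kernel_size - 1` frames are instead passed into each `forward` call and returned alongside the output. A minimal standalone sketch of that idea (not the diffusers code), using a 1-D causal convolution to show that chunked processing with an explicit cache matches running the whole sequence at once:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
weight = torch.randn(1, 1, 3)  # (out_channels, in_channels, kernel_size=3)
x = torch.randn(1, 1, 16)      # (batch, channels, time)

def causal_chunk(chunk, cache):
    # First chunk: replicate the first sample as implicit history,
    # mirroring `[inputs[:, :, :1]] * (kernel_size - 1)` in the diff.
    if cache is None:
        cache = chunk[:, :, :1].repeat(1, 1, weight.shape[-1] - 1)
    padded = torch.cat([cache, chunk], dim=2)
    new_cache = padded[:, :, -(weight.shape[-1] - 1):].clone()  # carried to the next call
    return F.conv1d(padded, weight), new_cache

cache = None
outs = []
for chunk in x.split(4, dim=2):  # process in four 4-sample chunks
    out, cache = causal_chunk(chunk, cache)
    outs.append(out)

full, _ = causal_chunk(x, None)  # same op over the whole sequence
assert torch.allclose(torch.cat(outs, dim=2), full, atol=1e-6)
```

Because the cache is threaded functionally rather than stored on the module, nothing has to be cleared between videos; the caller simply starts the next sequence with `conv_cache=None`.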
@@ -172,7 +172,12 @@ class CogVideoXSpatialNorm3D(nn.Module):
          self.conv_y = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
          self.conv_b = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)

-     def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor:
+     def forward(
+         self, f: torch.Tensor, zq: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None
+     ) -> torch.Tensor:
+         new_conv_cache = {}
+         conv_cache = conv_cache or {}
+
          if f.shape[2] > 1 and f.shape[2] % 2 == 1:
              f_first, f_rest = f[:, :, :1], f[:, :, 1:]
              f_first_size, f_rest_size = f_first.shape[-3:], f_rest.shape[-3:]
@@ -183,9 +188,12 @@
          else:
              zq = F.interpolate(zq, size=f.shape[-3:])

+         conv_y, new_conv_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
+         conv_b, new_conv_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))
+
          norm_f = self.norm_layer(f)
-         new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
-         return new_f
+         new_f = norm_f * conv_y + conv_b
+         return new_f, new_conv_cache


  class CogVideoXResnetBlock3D(nn.Module):
@@ -236,6 +244,7 @@
          self.out_channels = out_channels
          self.nonlinearity = get_activation(non_linearity)
          self.use_conv_shortcut = conv_shortcut
+         self.spatial_norm_dim = spatial_norm_dim

          if spatial_norm_dim is None:
              self.norm1 = nn.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
@@ -279,34 +288,43 @@
          inputs: torch.Tensor,
          temb: Optional[torch.Tensor] = None,
          zq: Optional[torch.Tensor] = None,
+         conv_cache: Optional[Dict[str, torch.Tensor]] = None,
      ) -> torch.Tensor:
+         new_conv_cache = {}
+         conv_cache = conv_cache or {}
+
          hidden_states = inputs

          if zq is not None:
-             hidden_states = self.norm1(hidden_states, zq)
+             hidden_states, new_conv_cache["norm1"] = self.norm1(hidden_states, zq, conv_cache=conv_cache.get("norm1"))
          else:
              hidden_states = self.norm1(hidden_states)

          hidden_states = self.nonlinearity(hidden_states)
-         hidden_states = self.conv1(hidden_states)
+         hidden_states, new_conv_cache["conv1"] = self.conv1(hidden_states, conv_cache=conv_cache.get("conv1"))

          if temb is not None:
              hidden_states = hidden_states + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]

          if zq is not None:
-             hidden_states = self.norm2(hidden_states, zq)
+             hidden_states, new_conv_cache["norm2"] = self.norm2(hidden_states, zq, conv_cache=conv_cache.get("norm2"))
          else:
              hidden_states = self.norm2(hidden_states)

          hidden_states = self.nonlinearity(hidden_states)
          hidden_states = self.dropout(hidden_states)
-         hidden_states = self.conv2(hidden_states)
+         hidden_states, new_conv_cache["conv2"] = self.conv2(hidden_states, conv_cache=conv_cache.get("conv2"))

          if self.in_channels != self.out_channels:
-             inputs = self.conv_shortcut(inputs)
+             if self.use_conv_shortcut:
+                 inputs, new_conv_cache["conv_shortcut"] = self.conv_shortcut(
+                     inputs, conv_cache=conv_cache.get("conv_shortcut")
+                 )
+             else:
+                 inputs = self.conv_shortcut(inputs)

          hidden_states = hidden_states + inputs
-         return hidden_states
+         return hidden_states, new_conv_cache


  class CogVideoXDownBlock3D(nn.Module):
@@ -392,9 +410,17 @@
          hidden_states: torch.Tensor,
          temb: Optional[torch.Tensor] = None,
          zq: Optional[torch.Tensor] = None,
+         conv_cache: Optional[Dict[str, torch.Tensor]] = None,
      ) -> torch.Tensor:
-         for resnet in self.resnets:
-             if self.training and self.gradient_checkpointing:
+         r"""Forward method of the `CogVideoXDownBlock3D` class."""
+
+         new_conv_cache = {}
+         conv_cache = conv_cache or {}
+
+         for i, resnet in enumerate(self.resnets):
+             conv_cache_key = f"resnet_{i}"
+
+             if torch.is_grad_enabled() and self.gradient_checkpointing:

                  def create_custom_forward(module):
                      def create_forward(*inputs):
@@ -402,17 +428,23 @@

                      return create_forward

-                 hidden_states = torch.utils.checkpoint.checkpoint(
-                     create_custom_forward(resnet), hidden_states, temb, zq
+                 hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(resnet),
+                     hidden_states,
+                     temb,
+                     zq,
+                     conv_cache.get(conv_cache_key),
                  )
              else:
-                 hidden_states = resnet(hidden_states, temb, zq)
+                 hidden_states, new_conv_cache[conv_cache_key] = resnet(
+                     hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
+                 )

          if self.downsamplers is not None:
              for downsampler in self.downsamplers:
                  hidden_states = downsampler(hidden_states)

-         return hidden_states
+         return hidden_states, new_conv_cache


  class CogVideoXMidBlock3D(nn.Module):
@@ -480,9 +512,17 @@
          hidden_states: torch.Tensor,
          temb: Optional[torch.Tensor] = None,
          zq: Optional[torch.Tensor] = None,
+         conv_cache: Optional[Dict[str, torch.Tensor]] = None,
      ) -> torch.Tensor:
-         for resnet in self.resnets:
-             if self.training and self.gradient_checkpointing:
+         r"""Forward method of the `CogVideoXMidBlock3D` class."""
+
+         new_conv_cache = {}
+         conv_cache = conv_cache or {}
+
+         for i, resnet in enumerate(self.resnets):
+             conv_cache_key = f"resnet_{i}"
+
+             if torch.is_grad_enabled() and self.gradient_checkpointing:

                  def create_custom_forward(module):
                      def create_forward(*inputs):
@@ -490,13 +530,15 @@

                      return create_forward

-                 hidden_states = torch.utils.checkpoint.checkpoint(
-                     create_custom_forward(resnet), hidden_states, temb, zq
+                 hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key)
                  )
              else:
-                 hidden_states = resnet(hidden_states, temb, zq)
+                 hidden_states, new_conv_cache[conv_cache_key] = resnet(
+                     hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
+                 )

-         return hidden_states
+         return hidden_states, new_conv_cache


  class CogVideoXUpBlock3D(nn.Module):
@@ -584,10 +626,17 @@
          hidden_states: torch.Tensor,
          temb: Optional[torch.Tensor] = None,
          zq: Optional[torch.Tensor] = None,
+         conv_cache: Optional[Dict[str, torch.Tensor]] = None,
      ) -> torch.Tensor:
          r"""Forward method of the `CogVideoXUpBlock3D` class."""
-         for resnet in self.resnets:
-             if self.training and self.gradient_checkpointing:
+
+         new_conv_cache = {}
+         conv_cache = conv_cache or {}
+
+         for i, resnet in enumerate(self.resnets):
+             conv_cache_key = f"resnet_{i}"
+
+             if torch.is_grad_enabled() and self.gradient_checkpointing:

                  def create_custom_forward(module):
                      def create_forward(*inputs):
@@ -595,17 +644,23 @@

                      return create_forward

-                 hidden_states = torch.utils.checkpoint.checkpoint(
-                     create_custom_forward(resnet), hidden_states, temb, zq
+                 hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(resnet),
+                     hidden_states,
+                     temb,
+                     zq,
+                     conv_cache.get(conv_cache_key),
                  )
              else:
-                 hidden_states = resnet(hidden_states, temb, zq)
+                 hidden_states, new_conv_cache[conv_cache_key] = resnet(
+                     hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
+                 )

          if self.upsamplers is not None:
              for upsampler in self.upsamplers:
                  hidden_states = upsampler(hidden_states)

-         return hidden_states
+         return hidden_states, new_conv_cache


  class CogVideoXEncoder3D(nn.Module):
@@ -705,11 +760,20 @@

          self.gradient_checkpointing = False

-     def forward(self, sample: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+     def forward(
+         self,
+         sample: torch.Tensor,
+         temb: Optional[torch.Tensor] = None,
+         conv_cache: Optional[Dict[str, torch.Tensor]] = None,
+     ) -> torch.Tensor:
          r"""The forward method of the `CogVideoXEncoder3D` class."""
-         hidden_states = self.conv_in(sample)

-         if self.training and self.gradient_checkpointing:
+         new_conv_cache = {}
+         conv_cache = conv_cache or {}
+
+         hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
+
+         if torch.is_grad_enabled() and self.gradient_checkpointing:

              def create_custom_forward(module):
                  def custom_forward(*inputs):
@@ -718,28 +782,44 @@
                  return custom_forward

              # 1. Down
-             for down_block in self.down_blocks:
-                 hidden_states = torch.utils.checkpoint.checkpoint(
-                     create_custom_forward(down_block), hidden_states, temb, None
+             for i, down_block in enumerate(self.down_blocks):
+                 conv_cache_key = f"down_block_{i}"
+                 hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(down_block),
+                     hidden_states,
+                     temb,
+                     None,
+                     conv_cache.get(conv_cache_key),
                  )

              # 2. Mid
-             hidden_states = torch.utils.checkpoint.checkpoint(
-                 create_custom_forward(self.mid_block), hidden_states, temb, None
+             hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint(
+                 create_custom_forward(self.mid_block),
+                 hidden_states,
+                 temb,
+                 None,
+                 conv_cache.get("mid_block"),
              )
          else:
              # 1. Down
-             for down_block in self.down_blocks:
-                 hidden_states = down_block(hidden_states, temb, None)
+             for i, down_block in enumerate(self.down_blocks):
+                 conv_cache_key = f"down_block_{i}"
+                 hidden_states, new_conv_cache[conv_cache_key] = down_block(
+                     hidden_states, temb, None, conv_cache.get(conv_cache_key)
+                 )

              # 2. Mid
-             hidden_states = self.mid_block(hidden_states, temb, None)
+             hidden_states, new_conv_cache["mid_block"] = self.mid_block(
+                 hidden_states, temb, None, conv_cache=conv_cache.get("mid_block")
+             )

          # 3. Post-process
          hidden_states = self.norm_out(hidden_states)
          hidden_states = self.conv_act(hidden_states)
-         hidden_states = self.conv_out(hidden_states)
-         return hidden_states
+
+         hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out"))
+
+         return hidden_states, new_conv_cache


  class CogVideoXDecoder3D(nn.Module):
@@ -846,11 +926,20 @@

          self.gradient_checkpointing = False

-     def forward(self, sample: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
+     def forward(
+         self,
+         sample: torch.Tensor,
+         temb: Optional[torch.Tensor] = None,
+         conv_cache: Optional[Dict[str, torch.Tensor]] = None,
+     ) -> torch.Tensor:
          r"""The forward method of the `CogVideoXDecoder3D` class."""
-         hidden_states = self.conv_in(sample)

-         if self.training and self.gradient_checkpointing:
+         new_conv_cache = {}
+         conv_cache = conv_cache or {}
+
+         hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
+
+         if torch.is_grad_enabled() and self.gradient_checkpointing:

              def create_custom_forward(module):
                  def custom_forward(*inputs):
@@ -859,28 +948,45 @@

                  return custom_forward

              # 1. Mid
-             hidden_states = torch.utils.checkpoint.checkpoint(
-                 create_custom_forward(self.mid_block), hidden_states, temb, sample
+             hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint(
+                 create_custom_forward(self.mid_block),
+                 hidden_states,
+                 temb,
+                 sample,
+                 conv_cache.get("mid_block"),
              )

              # 2. Up
-             for up_block in self.up_blocks:
-                 hidden_states = torch.utils.checkpoint.checkpoint(
-                     create_custom_forward(up_block), hidden_states, temb, sample
+             for i, up_block in enumerate(self.up_blocks):
+                 conv_cache_key = f"up_block_{i}"
+                 hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(up_block),
+                     hidden_states,
+                     temb,
+                     sample,
+                     conv_cache.get(conv_cache_key),
                  )
          else:
              # 1. Mid
-             hidden_states = self.mid_block(hidden_states, temb, sample)
+             hidden_states, new_conv_cache["mid_block"] = self.mid_block(
+                 hidden_states, temb, sample, conv_cache=conv_cache.get("mid_block")
+             )

              # 2. Up
-             for up_block in self.up_blocks:
-                 hidden_states = up_block(hidden_states, temb, sample)
+             for i, up_block in enumerate(self.up_blocks):
+                 conv_cache_key = f"up_block_{i}"
+                 hidden_states, new_conv_cache[conv_cache_key] = up_block(
+                     hidden_states, temb, sample, conv_cache=conv_cache.get(conv_cache_key)
+                 )

          # 3. Post-process
-         hidden_states = self.norm_out(hidden_states, sample)
+         hidden_states, new_conv_cache["norm_out"] = self.norm_out(
+             hidden_states, sample, conv_cache=conv_cache.get("norm_out")
+         )
          hidden_states = self.conv_act(hidden_states)
-         hidden_states = self.conv_out(hidden_states)
-         return hidden_states
+         hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out"))
+
+         return hidden_states, new_conv_cache


  class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
@@ -951,6 +1057,7 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
          force_upcast: float = True,
          use_quant_conv: bool = False,
          use_post_quant_conv: bool = False,
+         invert_scale_latents: bool = False,
      ):
          super().__init__()

@@ -1019,12 +1126,6 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
          if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)):
              module.gradient_checkpointing = value

-     def _clear_fake_context_parallel_cache(self):
-         for name, module in self.named_modules():
-             if isinstance(module, CogVideoXCausalConv3d):
-                 logger.debug(f"Clearing fake Context Parallel cache for layer: {name}")
-                 module._clear_fake_context_parallel_cache()
-
      def enable_tiling(
          self,
          tile_sample_min_height: Optional[int] = None,
@@ -1090,21 +1191,22 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):

          frame_batch_size = self.num_sample_frames_batch_size
          # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
-         num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+         # As the extra single frame is handled inside the loop, it is not required to round up here.
+         num_batches = max(num_frames // frame_batch_size, 1)
+         conv_cache = None
          enc = []
+
          for i in range(num_batches):
              remaining_frames = num_frames % frame_batch_size
              start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
              end_frame = frame_batch_size * (i + 1) + remaining_frames
              x_intermediate = x[:, :, start_frame:end_frame]
-             x_intermediate = self.encoder(x_intermediate)
+             x_intermediate, conv_cache = self.encoder(x_intermediate, conv_cache=conv_cache)
              if self.quant_conv is not None:
                  x_intermediate = self.quant_conv(x_intermediate)
              enc.append(x_intermediate)

-         self._clear_fake_context_parallel_cache()
          enc = torch.cat(enc, dim=2)
-
          return enc

      @apply_forward_hook
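For reference, here are the slice indices the encode loop above produces, with illustrative values `num_frames = 49` and `frame_batch_size = 8` (the `frame_batch_size * k + 1` case the comment mentions; the first batch absorbs the extra frame):

```python
num_frames, frame_batch_size = 49, 8  # illustrative values, not from the diff

num_batches = max(num_frames // frame_batch_size, 1)  # 6; no rounding up needed
remaining_frames = num_frames % frame_batch_size      # 1

slices = []
for i in range(num_batches):
    start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
    end_frame = frame_batch_size * (i + 1) + remaining_frames
    slices.append((start_frame, end_frame))

print(slices)  # [(0, 9), (9, 17), (17, 25), (25, 33), (33, 41), (41, 49)]
assert slices[-1][1] == num_frames  # every frame is covered exactly once
```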
@@ -1142,8 +1244,10 @@
              return self.tiled_decode(z, return_dict=return_dict)

          frame_batch_size = self.num_latent_frames_batch_size
-         num_batches = num_frames // frame_batch_size
+         num_batches = max(num_frames // frame_batch_size, 1)
+         conv_cache = None
          dec = []
+
          for i in range(num_batches):
              remaining_frames = num_frames % frame_batch_size
              start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
@@ -1151,10 +1255,9 @@
              z_intermediate = z[:, :, start_frame:end_frame]
              if self.post_quant_conv is not None:
                  z_intermediate = self.post_quant_conv(z_intermediate)
-             z_intermediate = self.decoder(z_intermediate)
+             z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache)
              dec.append(z_intermediate)

-         self._clear_fake_context_parallel_cache()
          dec = torch.cat(dec, dim=2)

          if not return_dict:
@@ -1237,8 +1340,11 @@
              row = []
              for j in range(0, width, overlap_width):
                  # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
-                 num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+                 # As the extra single frame is handled inside the loop, it is not required to round up here.
+                 num_batches = max(num_frames // frame_batch_size, 1)
+                 conv_cache = None
                  time = []
+
                  for k in range(num_batches):
                      remaining_frames = num_frames % frame_batch_size
                      start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
@@ -1250,11 +1356,11 @@
                          i : i + self.tile_sample_min_height,
                          j : j + self.tile_sample_min_width,
                      ]
-                     tile = self.encoder(tile)
+                     tile, conv_cache = self.encoder(tile, conv_cache=conv_cache)
                      if self.quant_conv is not None:
                          tile = self.quant_conv(tile)
                      time.append(tile)
-                 self._clear_fake_context_parallel_cache()
+
                  row.append(torch.cat(time, dim=2))
              rows.append(row)

@@ -1314,8 +1420,10 @@
          for i in range(0, height, overlap_height):
              row = []
              for j in range(0, width, overlap_width):
-                 num_batches = num_frames // frame_batch_size
+                 num_batches = max(num_frames // frame_batch_size, 1)
+                 conv_cache = None
                  time = []
+
                  for k in range(num_batches):
                      remaining_frames = num_frames % frame_batch_size
                      start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
@@ -1329,9 +1437,9 @@
                      ]
                      if self.post_quant_conv is not None:
                          tile = self.post_quant_conv(tile)
-                     tile = self.decoder(tile)
+                     tile, conv_cache = self.decoder(tile, conv_cache=conv_cache)
                      time.append(tile)
-                 self._clear_fake_context_parallel_cache()
+
                  row.append(torch.cat(time, dim=2))
              rows.append(row)

@@ -1368,7 +1476,7 @@
              z = posterior.sample(generator=generator)
          else:
              z = posterior.mode()
-         dec = self.decode(z)
+         dec = self.decode(z).sample
          if not return_dict:
              return (dec,)
-         return dec
+         return DecoderOutput(sample=dec)
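The final hunk fixes `AutoencoderKLCogVideoX.forward` to unwrap the `DecoderOutput` returned by `decode()` before re-wrapping it, so callers consistently read `.sample`. A usage sketch of the updated autoencoder; the checkpoint id, device, and shapes are illustrative, not taken from this diff:

```python
import torch
from diffusers import AutoencoderKLCogVideoX

vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16
).to("cuda")

# (batch, channels, frames, height, width); 9 frames matches the
# `frame_batch_size * k + 1` expectation noted in the diff's comments.
video = torch.randn(1, 3, 9, 256, 256, dtype=torch.float16, device="cuda")

with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()
    frames = vae.decode(latents).sample  # DecoderOutput -> tensor

print(frames.shape)
```

Internally, `encode`/`decode` now thread a `conv_cache` between frame batches (as shown in the hunks above) instead of clearing per-module state, so the chunked passes remain temporally consistent without any manual cache management by the caller.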