diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. diffusers/__init__.py +97 -4
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +13 -1
  4. diffusers/image_processor.py +282 -71
  5. diffusers/loaders/__init__.py +24 -3
  6. diffusers/loaders/ip_adapter.py +543 -16
  7. diffusers/loaders/lora_base.py +138 -125
  8. diffusers/loaders/lora_conversion_utils.py +647 -0
  9. diffusers/loaders/lora_pipeline.py +2216 -230
  10. diffusers/loaders/peft.py +380 -0
  11. diffusers/loaders/single_file_model.py +71 -4
  12. diffusers/loaders/single_file_utils.py +597 -10
  13. diffusers/loaders/textual_inversion.py +5 -3
  14. diffusers/loaders/transformer_flux.py +181 -0
  15. diffusers/loaders/transformer_sd3.py +89 -0
  16. diffusers/loaders/unet.py +56 -12
  17. diffusers/models/__init__.py +49 -12
  18. diffusers/models/activations.py +22 -9
  19. diffusers/models/adapter.py +53 -53
  20. diffusers/models/attention.py +98 -13
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2160 -346
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +73 -12
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +70 -0
  36. diffusers/models/controlnet_sd3.py +26 -376
  37. diffusers/models/controlnet_sparsectrl.py +46 -719
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +996 -92
  49. diffusers/models/embeddings_flax.py +23 -9
  50. diffusers/models/model_loading_utils.py +264 -14
  51. diffusers/models/modeling_flax_utils.py +1 -1
  52. diffusers/models/modeling_utils.py +334 -51
  53. diffusers/models/normalization.py +157 -13
  54. diffusers/models/transformers/__init__.py +6 -0
  55. diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
  56. diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
  57. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  58. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  59. diffusers/models/transformers/pixart_transformer_2d.py +10 -2
  60. diffusers/models/transformers/sana_transformer.py +488 -0
  61. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  62. diffusers/models/transformers/transformer_2d.py +1 -1
  63. diffusers/models/transformers/transformer_allegro.py +422 -0
  64. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  65. diffusers/models/transformers/transformer_flux.py +189 -51
  66. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  67. diffusers/models/transformers/transformer_ltx.py +469 -0
  68. diffusers/models/transformers/transformer_mochi.py +499 -0
  69. diffusers/models/transformers/transformer_sd3.py +112 -18
  70. diffusers/models/transformers/transformer_temporal.py +1 -1
  71. diffusers/models/unets/unet_1d_blocks.py +1 -1
  72. diffusers/models/unets/unet_2d.py +8 -1
  73. diffusers/models/unets/unet_2d_blocks.py +88 -21
  74. diffusers/models/unets/unet_2d_condition.py +9 -9
  75. diffusers/models/unets/unet_3d_blocks.py +9 -7
  76. diffusers/models/unets/unet_motion_model.py +46 -68
  77. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  78. diffusers/models/unets/unet_stable_cascade.py +2 -2
  79. diffusers/models/unets/uvit_2d.py +1 -1
  80. diffusers/models/upsampling.py +14 -6
  81. diffusers/pipelines/__init__.py +69 -6
  82. diffusers/pipelines/allegro/__init__.py +48 -0
  83. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  84. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  85. diffusers/pipelines/animatediff/__init__.py +2 -0
  86. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  87. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
  88. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  89. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
  90. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
  91. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  92. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  93. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
  94. diffusers/pipelines/auto_pipeline.py +88 -10
  95. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  96. diffusers/pipelines/cogvideo/__init__.py +2 -0
  97. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
  98. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  99. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
  100. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
  101. diffusers/pipelines/cogview3/__init__.py +47 -0
  102. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  103. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  104. diffusers/pipelines/controlnet/__init__.py +86 -80
  105. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  106. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
  107. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
  108. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
  109. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
  110. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
  111. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
  112. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  113. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  114. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  115. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
  116. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  117. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
  118. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  119. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  120. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  121. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  122. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  123. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
  124. diffusers/pipelines/flux/__init__.py +23 -1
  125. diffusers/pipelines/flux/modeling_flux.py +47 -0
  126. diffusers/pipelines/flux/pipeline_flux.py +256 -48
  127. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  128. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  129. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  130. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  131. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  132. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  133. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  134. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  135. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  136. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  137. diffusers/pipelines/flux/pipeline_output.py +16 -0
  138. diffusers/pipelines/free_noise_utils.py +365 -5
  139. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  140. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  141. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  142. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
  143. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  145. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  146. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  147. diffusers/pipelines/kolors/text_encoder.py +2 -2
  148. diffusers/pipelines/kolors/tokenizer.py +4 -0
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  155. diffusers/pipelines/ltx/__init__.py +50 -0
  156. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  157. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  158. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  159. diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
  160. diffusers/pipelines/mochi/__init__.py +48 -0
  161. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  162. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  163. diffusers/pipelines/pag/__init__.py +13 -0
  164. diffusers/pipelines/pag/pag_utils.py +8 -2
  165. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
  166. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  167. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
  168. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  169. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
  170. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  171. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
  172. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  173. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  174. diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
  175. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  176. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  177. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  178. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  179. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  180. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  181. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  182. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  183. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  184. diffusers/pipelines/pipeline_loading_utils.py +250 -31
  185. diffusers/pipelines/pipeline_utils.py +158 -186
  186. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
  187. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
  188. diffusers/pipelines/sana/__init__.py +47 -0
  189. diffusers/pipelines/sana/pipeline_output.py +21 -0
  190. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  191. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  192. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  193. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  194. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
  195. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  196. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  197. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  198. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
  199. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
  200. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
  201. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  202. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  203. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  204. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  205. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
  206. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
  208. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  209. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  210. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  211. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  212. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  213. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  214. diffusers/quantizers/__init__.py +16 -0
  215. diffusers/quantizers/auto.py +139 -0
  216. diffusers/quantizers/base.py +233 -0
  217. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  218. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  219. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  220. diffusers/quantizers/gguf/__init__.py +1 -0
  221. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  222. diffusers/quantizers/gguf/utils.py +456 -0
  223. diffusers/quantizers/quantization_config.py +669 -0
  224. diffusers/quantizers/torchao/__init__.py +15 -0
  225. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  226. diffusers/schedulers/scheduling_ddim.py +4 -1
  227. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  228. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  229. diffusers/schedulers/scheduling_ddpm.py +6 -7
  230. diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
  231. diffusers/schedulers/scheduling_deis_multistep.py +102 -6
  232. diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
  236. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  237. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  238. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  239. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  240. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  241. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  244. diffusers/schedulers/scheduling_lcm.py +2 -6
  245. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  246. diffusers/schedulers/scheduling_repaint.py +1 -1
  247. diffusers/schedulers/scheduling_sasolver.py +102 -6
  248. diffusers/schedulers/scheduling_tcd.py +2 -6
  249. diffusers/schedulers/scheduling_unclip.py +4 -1
  250. diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
  251. diffusers/training_utils.py +63 -19
  252. diffusers/utils/__init__.py +7 -1
  253. diffusers/utils/constants.py +1 -0
  254. diffusers/utils/dummy_pt_objects.py +240 -0
  255. diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
  256. diffusers/utils/dynamic_modules_utils.py +3 -3
  257. diffusers/utils/hub_utils.py +44 -40
  258. diffusers/utils/import_utils.py +98 -8
  259. diffusers/utils/loading_utils.py +28 -4
  260. diffusers/utils/peft_utils.py +6 -3
  261. diffusers/utils/testing_utils.py +115 -1
  262. diffusers/utils/torch_utils.py +3 -0
  263. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
  264. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
  265. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  266. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  267. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  268. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/audioldm2/modeling_audioldm2.py

@@ -1112,7 +1112,7 @@ class CrossAttnDownBlock2D(nn.Module):
         )
 
         for i in range(num_layers):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
@@ -1290,7 +1290,7 @@ class UNetMidBlock2DCrossAttn(nn.Module):
         )
 
         for i in range(len(self.resnets[1:])):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
@@ -1464,7 +1464,7 @@ class CrossAttnUpBlock2D(nn.Module):
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
             hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
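
All three hunks make the same fix: these blocks used to gate gradient checkpointing on `self.training`, so a frozen module kept in eval mode (the usual situation when only a LoRA or adapter on top of it is being trained) never checkpointed even though gradients flowed through it. Gating on `torch.is_grad_enabled()` activates checkpointing whenever autograd is actually recording. A toy sketch of the behavioral difference, not the diffusers code itself:

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint


class ToyBlock(nn.Module):
    # Stand-in for a diffusers block that supports gradient checkpointing.
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(16, 16)
        self.gradient_checkpointing = True

    def forward(self, x):
        # The old gate (`self.training and ...`) skipped checkpointing for a
        # frozen block kept in eval mode; the new gate only asks whether
        # autograd is currently recording.
        if torch.is_grad_enabled() and self.gradient_checkpointing:
            return checkpoint(self.layer, x, use_reentrant=False)
        return self.layer(x)


block = ToyBlock().eval()                   # e.g. a frozen backbone during LoRA training
x = torch.randn(2, 16, requires_grad=True)
block(x).sum().backward()                   # checkpointing is active despite eval mode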
diffusers/pipelines/aura_flow/pipeline_aura_flow.py

@@ -53,7 +53,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -387,7 +387,6 @@ class AuraFlowPipeline(DiffusionPipeline):
         prompt: Union[str, List[str]] = None,
         negative_prompt: Union[str, List[str]] = None,
         num_inference_steps: int = 50,
-        timesteps: List[int] = None,
         sigmas: List[float] = None,
         guidance_scale: float = 3.5,
         num_images_per_prompt: Optional[int] = 1,
@@ -424,10 +423,6 @@ class AuraFlowPipeline(DiffusionPipeline):
             sigmas (`List[float]`, *optional*):
                 Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
                 `num_inference_steps` and `timesteps` must be `None`.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
             guidance_scale (`float`, *optional*, defaults to 5.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -522,9 +517,7 @@ class AuraFlowPipeline(DiffusionPipeline):
         # 4. Prepare timesteps
 
         # sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler, num_inference_steps, device, timesteps, sigmas
-        )
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
 
         # 5. Prepare latents.
         latent_channels = self.transformer.config.in_channels
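
Net effect: AuraFlow custom schedules are now expressed exclusively through `sigmas`; the dangling `timesteps` parameter, which was documented but would be forwarded to a scheduler that does not accept it, is removed. A usage sketch against the 0.32.0 signature; the checkpoint id is the usual public AuraFlow repo and the linear schedule is purely illustrative:

import torch
from diffusers import AuraFlowPipeline

pipe = AuraFlowPipeline.from_pretrained("fal/AuraFlow", torch_dtype=torch.float16).to("cuda")

# A custom noise schedule is passed as `sigmas`; `timesteps` is no longer
# part of AuraFlowPipeline.__call__ in 0.32.0.
sigmas = [1.0 - i / 25 for i in range(25)]
image = pipe(prompt="a watercolor fox", sigmas=sigmas, guidance_scale=3.5).images[0]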
diffusers/pipelines/auto_pipeline.py

@@ -18,8 +18,10 @@ from collections import OrderedDict
 from huggingface_hub.utils import validate_hf_hub_args
 
 from ..configuration_utils import ConfigMixin
+from ..models.controlnets import ControlNetUnionModel
 from ..utils import is_sentencepiece_available
 from .aura_flow import AuraFlowPipeline
+from .cogview3 import CogView3PlusPipeline
 from .controlnet import (
     StableDiffusionControlNetImg2ImgPipeline,
     StableDiffusionControlNetInpaintPipeline,
@@ -27,9 +29,22 @@ from .controlnet import (
     StableDiffusionXLControlNetImg2ImgPipeline,
     StableDiffusionXLControlNetInpaintPipeline,
     StableDiffusionXLControlNetPipeline,
+    StableDiffusionXLControlNetUnionImg2ImgPipeline,
+    StableDiffusionXLControlNetUnionInpaintPipeline,
+    StableDiffusionXLControlNetUnionPipeline,
 )
 from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
-from .flux import FluxPipeline
+from .flux import (
+    FluxControlImg2ImgPipeline,
+    FluxControlInpaintPipeline,
+    FluxControlNetImg2ImgPipeline,
+    FluxControlNetInpaintPipeline,
+    FluxControlNetPipeline,
+    FluxControlPipeline,
+    FluxImg2ImgPipeline,
+    FluxInpaintPipeline,
+    FluxPipeline,
+)
 from .hunyuandit import HunyuanDiTPipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
@@ -49,12 +64,18 @@ from .kandinsky2_2 import (
 )
 from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
+from .lumina import LuminaText2ImgPipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
+    StableDiffusion3PAGImg2ImgPipeline,
     StableDiffusion3PAGPipeline,
+    StableDiffusionControlNetPAGInpaintPipeline,
     StableDiffusionControlNetPAGPipeline,
+    StableDiffusionPAGImg2ImgPipeline,
+    StableDiffusionPAGInpaintPipeline,
     StableDiffusionPAGPipeline,
+    StableDiffusionXLControlNetPAGImg2ImgPipeline,
     StableDiffusionXLControlNetPAGPipeline,
     StableDiffusionXLPAGImg2ImgPipeline,
     StableDiffusionXLPAGInpaintPipeline,
@@ -94,6 +115,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky3", Kandinsky3Pipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
+        ("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionPipeline),
         ("wuerstchen", WuerstchenCombinedPipeline),
         ("cascade", StableCascadeCombinedPipeline),
         ("lcm", LatentConsistencyModelPipeline),
@@ -106,6 +128,10 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("pixart-sigma-pag", PixArtSigmaPAGPipeline),
         ("auraflow", AuraFlowPipeline),
         ("flux", FluxPipeline),
+        ("flux-control", FluxControlPipeline),
+        ("flux-controlnet", FluxControlNetPipeline),
+        ("lumina", LuminaText2ImgPipeline),
+        ("cogview3", CogView3PlusPipeline),
     ]
 )
 
@@ -114,14 +140,21 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("stable-diffusion", StableDiffusionImg2ImgPipeline),
         ("stable-diffusion-xl", StableDiffusionXLImg2ImgPipeline),
         ("stable-diffusion-3", StableDiffusion3Img2ImgPipeline),
+        ("stable-diffusion-3-pag", StableDiffusion3PAGImg2ImgPipeline),
         ("if", IFImg2ImgPipeline),
         ("kandinsky", KandinskyImg2ImgCombinedPipeline),
         ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
         ("kandinsky3", Kandinsky3Img2ImgPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
+        ("stable-diffusion-pag", StableDiffusionPAGImg2ImgPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionImg2ImgPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
+        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
         ("lcm", LatentConsistencyModelImg2ImgPipeline),
+        ("flux", FluxImg2ImgPipeline),
+        ("flux-controlnet", FluxControlNetImg2ImgPipeline),
+        ("flux-control", FluxControlImg2ImgPipeline),
     ]
 )
 
@@ -134,8 +167,14 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
         ("kandinsky", KandinskyInpaintCombinedPipeline),
         ("kandinsky22", KandinskyV22InpaintCombinedPipeline),
         ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
+        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGInpaintPipeline),
         ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
+        ("stable-diffusion-xl-controlnet-union", StableDiffusionXLControlNetUnionInpaintPipeline),
         ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
+        ("flux", FluxInpaintPipeline),
+        ("flux-controlnet", FluxControlNetInpaintPipeline),
+        ("flux-control", FluxControlInpaintPipeline),
+        ("stable-diffusion-pag", StableDiffusionPAGInpaintPipeline),
     ]
 )
 
@@ -161,12 +200,12 @@ _AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict(
 )
 
 if is_sentencepiece_available():
-    from .kolors import KolorsPipeline
+    from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
     from .pag import KolorsPAGPipeline
 
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
    AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
-    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline
 
 SUPPORTED_TASKS_MAPPINGS = [
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
@@ -368,13 +407,20 @@ class AutoPipelineForText2Image(ConfigMixin):
 
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
+        if "ControlPipeline" in orig_class_name:
+            to_replace = "ControlPipeline"
+        else:
+            to_replace = "Pipeline"
 
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            if isinstance(kwargs["controlnet"], ControlNetUnionModel):
+                orig_class_name = config["_class_name"].replace(to_replace, "ControlNetUnionPipeline")
+            else:
+                orig_class_name = config["_class_name"].replace(to_replace, "ControlNetPipeline")
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
+                orig_class_name = orig_class_name.replace(to_replace, "PAGPipeline")
 
         text_2_image_cls = _get_task_class(AUTO_TEXT2IMAGE_PIPELINES_MAPPING, orig_class_name)
 
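With the new `to_replace` computation, `AutoPipelineForText2Image` can resolve the union ControlNet variants added in this release. A usage sketch; both checkpoint ids are illustrative (any SDXL base plus a ControlNet-Union checkpoint pair should behave the same way):

import torch
from diffusers import AutoPipelineForText2Image, ControlNetUnionModel

controlnet = ControlNetUnionModel.from_pretrained(
    "xinsir/controlnet-union-sdxl-1.0", torch_dtype=torch.float16
)
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    torch_dtype=torch.float16,
)
# Resolved via AUTO_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-xl-controlnet-union"]
print(type(pipe).__name__)  # StableDiffusionXLControlNetUnionPipeline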
@@ -656,12 +702,29 @@ class AutoPipelineForImage2Image(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # the `orig_class_name` can be:
+        # `- *Pipeline` (for regular text-to-image checkpoint)
+        # - `*ControlPipeline` (for Flux tools specific checkpoint)
+        # `- *Img2ImgPipeline` (for refiner checkpoint)
+        if "Img2Img" in orig_class_name:
+            to_replace = "Img2ImgPipeline"
+        elif "ControlPipeline" in orig_class_name:
+            to_replace = "ControlPipeline"
+        else:
+            to_replace = "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            if isinstance(kwargs["controlnet"], ControlNetUnionModel):
+                orig_class_name = orig_class_name.replace(to_replace, "ControlNetUnion" + to_replace)
+            else:
+                orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = orig_class_name.replace("Pipeline", "PAGPipeline")
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
+
+        if to_replace == "ControlPipeline":
+            orig_class_name = orig_class_name.replace(to_replace, "ControlImg2ImgPipeline")
 
         image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name)
 
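The comment block above spells out the three shapes `orig_class_name` can take; the suffix that gets rewritten is chosen accordingly. Worked through in isolation for an SDXL refiner config loaded with `enable_pag=True`:

# Standalone rendition of the class-name rewriting shown above.
orig_class_name = "StableDiffusionXLImg2ImgPipeline"

if "Img2Img" in orig_class_name:
    to_replace = "Img2ImgPipeline"
elif "ControlPipeline" in orig_class_name:
    to_replace = "ControlPipeline"
else:
    to_replace = "Pipeline"

# enable_pag=True, no `controlnet` kwarg, so only the PAG rewrite fires:
orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
print(orig_class_name)  # StableDiffusionXLPAGImg2ImgPipeline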
@@ -948,13 +1011,28 @@ class AutoPipelineForInpainting(ConfigMixin):
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         orig_class_name = config["_class_name"]
 
+        # The `orig_class_name`` can be:
+        # `- *InpaintPipeline` (for inpaint-specific checkpoint)
+        # - `*ControlPipeline` (for Flux tools specific checkpoint)
+        # - or *Pipeline (for regular text-to-image checkpoint)
+        if "Inpaint" in orig_class_name:
+            to_replace = "InpaintPipeline"
+        elif "ControlPipeline" in orig_class_name:
+            to_replace = "ControlPipeline"
+        else:
+            to_replace = "Pipeline"
+
         if "controlnet" in kwargs:
-            orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline")
+            if isinstance(kwargs["controlnet"], ControlNetUnionModel):
+                orig_class_name = orig_class_name.replace(to_replace, "ControlNetUnion" + to_replace)
+            else:
+                orig_class_name = orig_class_name.replace(to_replace, "ControlNet" + to_replace)
         if "enable_pag" in kwargs:
             enable_pag = kwargs.pop("enable_pag")
             if enable_pag:
-                orig_class_name = config["_class_name"].replace("Pipeline", "PAGPipeline")
-
+                orig_class_name = orig_class_name.replace(to_replace, "PAG" + to_replace)
+        if to_replace == "ControlPipeline":
+            orig_class_name = orig_class_name.replace(to_replace, "ControlInpaintPipeline")
         inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
 
         kwargs = {**load_config_kwargs, **kwargs}
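`AutoPipelineForInpainting` follows the same pattern, with a final rewrite for Flux tools checkpoints whose config still names a `*ControlPipeline` class. Worked through in isolation:

# Standalone rendition for a Flux tools checkpoint loaded through
# AutoPipelineForInpainting.
orig_class_name = "FluxControlPipeline"

if "Inpaint" in orig_class_name:
    to_replace = "InpaintPipeline"
elif "ControlPipeline" in orig_class_name:
    to_replace = "ControlPipeline"
else:
    to_replace = "Pipeline"

# No `controlnet` kwarg and no PAG requested, so only the final rewrite fires:
if to_replace == "ControlPipeline":
    orig_class_name = orig_class_name.replace(to_replace, "ControlInpaintPipeline")
print(orig_class_name)  # FluxControlInpaintPipeline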
diffusers/pipelines/blip_diffusion/modeling_blip2.py

@@ -167,7 +167,7 @@ class Blip2QFormerEncoder(nn.Module):
             layer_head_mask = head_mask[i] if head_mask is not None else None
             past_key_value = past_key_values[i] if past_key_values is not None else None
 
-            if getattr(self.config, "gradient_checkpointing", False) and self.training:
+            if getattr(self.config, "gradient_checkpointing", False) and torch.is_grad_enabled():
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."

diffusers/pipelines/cogvideo/__init__.py

@@ -23,6 +23,7 @@ except OptionalDependencyNotAvailable:
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_fun_control"] = ["CogVideoXFunControlPipeline"]
     _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
     _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from ...utils.dummy_torch_and_transformers_objects import *
 else:
     from .pipeline_cogvideox import CogVideoXPipeline
+    from .pipeline_cogvideox_fun_control import CogVideoXFunControlPipeline
     from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
     from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
diffusers/pipelines/cogvideo/pipeline_cogvideox.py

@@ -15,12 +15,13 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
@@ -85,7 +86,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -136,7 +137,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class CogVideoXPipeline(DiffusionPipeline):
+class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation using CogVideoX.
 
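Because `CogVideoXPipeline` now mixes in `CogVideoXLoraLoaderMixin`, LoRA checkpoints can be loaded straight onto the pipeline. A usage sketch; the LoRA repo id and weight filename below are placeholders for an actual checkpoint:

import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16).to("cuda")

# `load_lora_weights` is provided by the new CogVideoXLoraLoaderMixin base class.
pipe.load_lora_weights("your-org/cogvideox-lora", weight_name="pytorch_lora_weights.safetensors")

video = pipe(prompt="a panda playing guitar", num_inference_steps=50).frames[0]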
@@ -187,6 +188,9 @@ class CogVideoXPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -316,6 +320,12 @@ class CogVideoXPipeline(DiffusionPipeline):
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         shape = (
             batch_size,
             (num_frames - 1) // self.vae_scale_factor_temporal + 1,
@@ -323,11 +333,6 @@ class CogVideoXPipeline(DiffusionPipeline):
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
 
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
@@ -340,7 +345,7 @@ class CogVideoXPipeline(DiffusionPipeline):
 
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -437,22 +442,39 @@ class CogVideoXPipeline(DiffusionPipeline):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
 
-        grid_crops_coords = get_resize_crop_region_for_grid(
-            (grid_height, grid_width), base_size_width, base_size_height
-        )
-        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
-            embed_dim=self.transformer.config.attention_head_dim,
-            crops_coords=grid_crops_coords,
-            grid_size=(grid_height, grid_width),
-            temporal_size=num_frames,
-            use_real=True,
-        )
+        p = self.transformer.config.patch_size
+        p_t = self.transformer.config.patch_size_t
+
+        base_size_width = self.transformer.config.sample_width // p
+        base_size_height = self.transformer.config.sample_height // p
+
+        if p_t is None:
+            # CogVideoX 1.0
+            grid_crops_coords = get_resize_crop_region_for_grid(
+                (grid_height, grid_width), base_size_width, base_size_height
+            )
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=grid_crops_coords,
+                grid_size=(grid_height, grid_width),
+                temporal_size=num_frames,
+                device=device,
+            )
+        else:
+            # CogVideoX 1.5
+            base_num_frames = (num_frames + p_t - 1) // p_t
+
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=None,
+                grid_size=(grid_height, grid_width),
+                temporal_size=base_num_frames,
+                grid_type="slice",
+                max_size=(base_size_height, base_size_width),
+                device=device,
+            )
 
-        freqs_cos = freqs_cos.to(device=device)
-        freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin
 
     @property
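
The rewritten RoPE helper branches on `patch_size_t`: CogVideoX 1.0 checkpoints (where it is `None`) keep the resize-crop grid path, while 1.5 checkpoints ceil-divide the temporal length by the temporal patch size and build a "slice" grid capped at the base size. The temporal arithmetic, with illustrative values:

# Assumed CogVideoX 1.5 values: patch_size_t = 2, and a temporal length of
# 22 latent frames (already padded to a multiple of patch_size_t; see the
# "5. Prepare latents" hunk further down).
num_frames = 22
p_t = 2
base_num_frames = (num_frames + p_t - 1) // p_t  # ceil division
print(base_num_frames)  # 11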
@@ -463,6 +485,10 @@ class CogVideoXPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -473,9 +499,9 @@ class CogVideoXPipeline(DiffusionPipeline):
         self,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 480,
-        width: int = 720,
-        num_frames: int = 49,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         guidance_scale: float = 6,
@@ -488,6 +514,7 @@ class CogVideoXPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -505,14 +532,14 @@ class CogVideoXPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -549,6 +576,10 @@ class CogVideoXPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -570,16 +601,13 @@ class CogVideoXPipeline(DiffusionPipeline):
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
-        if num_frames > 49:
-            raise ValueError(
-                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
-            )
-
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
@@ -593,6 +621,7 @@ class CogVideoXPipeline(DiffusionPipeline):
             negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
        self._interrupt = False
 
         # 2. Default call parameters
@@ -628,7 +657,16 @@ class CogVideoXPipeline(DiffusionPipeline):
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
         self._num_timesteps = len(timesteps)
 
-        # 5. Prepare latents.
+        # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
+        patch_size_t = self.transformer.config.patch_size_t
+        additional_frames = 0
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            additional_frames = patch_size_t - latent_frames % patch_size_t
+            num_frames += additional_frames * self.vae_scale_factor_temporal
+
         latent_channels = self.transformer.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,
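
The padding step above rounds the latent frame count up to a multiple of `patch_size_t` by requesting extra pixel frames. Worked through with illustrative values (`patch_size_t = 2` and `vae_scale_factor_temporal = 4`, typical for CogVideoX 1.5):

# Frame-padding arithmetic from the "5. Prepare latents" block above.
num_frames = 81
vae_scale_factor_temporal = 4
patch_size_t = 2

latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1   # 21
additional_frames = 0
if patch_size_t is not None and latent_frames % patch_size_t != 0:
    additional_frames = patch_size_t - latent_frames % patch_size_t  # 1
    num_frames += additional_frames * vae_scale_factor_temporal      # 85

print(latent_frames, additional_frames, num_frames)  # 21 1 85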
@@ -674,6 +712,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
@@ -717,6 +756,8 @@ class CogVideoXPipeline(DiffusionPipeline):
                 progress_bar.update()
 
         if not output_type == "latent":
+            # Discard any padding frames that were added for CogVideoX 1.5
+            latents = latents[:, additional_frames:]
             video = self.decode_latents(latents)
             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
         else:
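
At decode time the pipeline slices the same `additional_frames` count off the front of the latents, so the padding never reaches the output video, and the new `attention_kwargs` dict is forwarded to the transformer on every denoising step. An end-to-end sketch with a CogVideoX 1.5 checkpoint; the `scale` entry assumes the PEFT-style LoRA scaling convention and the repo id is illustrative:

import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX1.5-5B", torch_dtype=torch.bfloat16).to("cuda")

video = pipe(
    prompt="a koala surfing at sunset",
    num_frames=81,                    # 21 latent frames, padded to 22 internally
    attention_kwargs={"scale": 0.8},  # assumed LoRA-scale kwarg, forwarded each step
).frames[0]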