diffusers 0.30.3__py3-none-any.whl → 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. diffusers/__init__.py +34 -2
  2. diffusers/configuration_utils.py +12 -0
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +257 -54
  5. diffusers/loaders/__init__.py +2 -0
  6. diffusers/loaders/ip_adapter.py +5 -1
  7. diffusers/loaders/lora_base.py +14 -7
  8. diffusers/loaders/lora_conversion_utils.py +332 -0
  9. diffusers/loaders/lora_pipeline.py +707 -41
  10. diffusers/loaders/peft.py +1 -0
  11. diffusers/loaders/single_file_utils.py +81 -4
  12. diffusers/loaders/textual_inversion.py +2 -0
  13. diffusers/loaders/unet.py +39 -8
  14. diffusers/models/__init__.py +4 -0
  15. diffusers/models/adapter.py +53 -53
  16. diffusers/models/attention.py +86 -10
  17. diffusers/models/attention_processor.py +169 -133
  18. diffusers/models/autoencoders/autoencoder_kl.py +71 -11
  19. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +187 -88
  20. diffusers/models/controlnet_flux.py +536 -0
  21. diffusers/models/controlnet_sd3.py +7 -3
  22. diffusers/models/controlnet_sparsectrl.py +0 -1
  23. diffusers/models/embeddings.py +170 -61
  24. diffusers/models/embeddings_flax.py +23 -9
  25. diffusers/models/model_loading_utils.py +182 -14
  26. diffusers/models/modeling_utils.py +283 -46
  27. diffusers/models/normalization.py +79 -0
  28. diffusers/models/transformers/__init__.py +1 -0
  29. diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
  30. diffusers/models/transformers/cogvideox_transformer_3d.py +23 -2
  31. diffusers/models/transformers/pixart_transformer_2d.py +9 -1
  32. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  33. diffusers/models/transformers/transformer_flux.py +161 -44
  34. diffusers/models/transformers/transformer_sd3.py +7 -1
  35. diffusers/models/unets/unet_2d_condition.py +8 -8
  36. diffusers/models/unets/unet_motion_model.py +41 -63
  37. diffusers/models/upsampling.py +6 -6
  38. diffusers/pipelines/__init__.py +35 -6
  39. diffusers/pipelines/animatediff/__init__.py +2 -0
  40. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  41. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
  42. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  43. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
  44. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
  45. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  46. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
  47. diffusers/pipelines/auto_pipeline.py +39 -8
  48. diffusers/pipelines/cogvideo/__init__.py +2 -0
  49. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +30 -17
  50. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
  51. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +41 -31
  52. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +42 -29
  53. diffusers/pipelines/cogview3/__init__.py +47 -0
  54. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  55. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  56. diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
  57. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
  58. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
  59. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
  60. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
  61. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
  62. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
  63. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  64. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
  65. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  66. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  67. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  68. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  69. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  70. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  71. diffusers/pipelines/flux/__init__.py +10 -0
  72. diffusers/pipelines/flux/pipeline_flux.py +53 -20
  73. diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
  74. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
  75. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
  76. diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
  77. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
  78. diffusers/pipelines/free_noise_utils.py +365 -5
  79. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
  80. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  81. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  82. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  83. diffusers/pipelines/kolors/tokenizer.py +4 -0
  84. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  85. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  86. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  87. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  88. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  89. diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
  90. diffusers/pipelines/pag/__init__.py +6 -0
  91. diffusers/pipelines/pag/pag_utils.py +8 -2
  92. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
  93. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
  94. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
  95. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
  96. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
  97. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  98. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
  99. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  100. diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
  101. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  102. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
  103. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  104. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  105. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  106. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  107. diffusers/pipelines/pipeline_loading_utils.py +225 -27
  108. diffusers/pipelines/pipeline_utils.py +123 -180
  109. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  110. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  111. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  112. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  113. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
  114. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  115. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  116. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  117. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
  118. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
  119. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
  120. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  121. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  122. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  123. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
  124. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
  125. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
  126. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  127. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  128. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  129. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  130. diffusers/quantizers/__init__.py +16 -0
  131. diffusers/quantizers/auto.py +126 -0
  132. diffusers/quantizers/base.py +233 -0
  133. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  134. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
  135. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  136. diffusers/quantizers/quantization_config.py +391 -0
  137. diffusers/schedulers/scheduling_ddim.py +4 -1
  138. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  139. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  140. diffusers/schedulers/scheduling_ddpm.py +4 -1
  141. diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
  142. diffusers/schedulers/scheduling_deis_multistep.py +78 -1
  143. diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
  144. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
  145. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  146. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
  147. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  148. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  149. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  150. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  151. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  152. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  153. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  154. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  155. diffusers/schedulers/scheduling_sasolver.py +78 -1
  156. diffusers/schedulers/scheduling_unclip.py +4 -1
  157. diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
  158. diffusers/training_utils.py +48 -18
  159. diffusers/utils/__init__.py +2 -1
  160. diffusers/utils/dummy_pt_objects.py +60 -0
  161. diffusers/utils/dummy_torch_and_transformers_objects.py +165 -0
  162. diffusers/utils/hub_utils.py +16 -4
  163. diffusers/utils/import_utils.py +31 -8
  164. diffusers/utils/loading_utils.py +28 -4
  165. diffusers/utils/peft_utils.py +3 -3
  166. diffusers/utils/testing_utils.py +59 -0
  167. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
  168. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/RECORD +172 -149
  169. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
  170. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/WHEEL +0 -0
  171. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
  172. {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
@@ -15,7 +15,7 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import PIL
 import torch
@@ -23,6 +23,7 @@ from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
@@ -87,7 +88,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -152,7 +153,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXImageToVideoPipeline(DiffusionPipeline):
+class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using CogVideoX.
 
@@ -207,6 +208,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -348,6 +352,12 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         generator: Optional[torch.Generator] = None,
         latents: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         shape = (
             batch_size,
@@ -357,12 +367,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )
 
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         image = image.unsqueeze(2)  # [B, C, F, H, W]
 
         if isinstance(generator, list):
@@ -373,7 +377,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
 
         image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-        image_latents = self.vae.config.scaling_factor * image_latents
+        image_latents = self.vae_scaling_factor_image * image_latents
 
         padding_shape = (
             batch_size,
@@ -397,7 +401,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -438,7 +442,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         width,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
-        video=None,
         latents=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
@@ -494,9 +497,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )
 
-        if video is not None and latents is not None:
-            raise ValueError("Only one of `video` or `latents` should be provided")
-
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
     def fuse_qkv_projections(self) -> None:
         r"""Enables fused QKV projections."""
@@ -547,6 +547,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -573,6 +577,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -584,7 +589,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
 
         Args:
             image (`PipelineImageInput`):
-                The input video to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
@@ -592,14 +597,14 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -636,6 +641,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -665,22 +674,22 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            image,
-            prompt,
-            height,
-            width,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-            prompt_embeds,
-            negative_prompt_embeds,
+            image=image,
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -770,6 +779,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
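The image-to-video pipeline now mixes in CogVideoXLoraLoaderMixin, so LoRA weights can be loaded directly on the pipeline object. A minimal usage sketch follows; the checkpoint id and LoRA repository name are illustrative assumptions, not part of this diff:

import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",  # assumed checkpoint id, shown for illustration only
    torch_dtype=torch.bfloat16,
).to("cuda")

# load_lora_weights is provided by the newly added CogVideoXLoraLoaderMixin
pipe.load_lora_weights("some-user/cogvideox-i2v-lora", adapter_name="example")  # hypothetical LoRA repo

image = load_image("input.png")
frames = pipe(image=image, prompt="a slow pan across a mountain lake").frames[0]
export_to_video(frames, "output.mp4", fps=8)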
diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py
@@ -15,21 +15,19 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from PIL import Image
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import (
-    logging,
-    replace_example_docstring,
-)
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from .pipeline_output import CogVideoXPipelineOutput
@@ -96,7 +94,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -161,7 +159,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
+class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for video-to-video generation using CogVideoX.
 
@@ -206,12 +204,16 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         self.register_modules(
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
+
         self.vae_scale_factor_spatial = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -353,6 +355,12 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         latents: Optional[torch.Tensor] = None,
         timestep: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)
 
         shape = (
@@ -363,12 +371,6 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )
 
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         if latents is None:
             if isinstance(generator, list):
                 if len(generator) != batch_size:
@@ -384,7 +386,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
 
             init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-            init_latents = self.vae.config.scaling_factor * init_latents
+            init_latents = self.vae_scaling_factor_image * init_latents
 
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
             latents = self.scheduler.add_noise(init_latents, noise, timestep)
@@ -398,7 +400,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -541,6 +543,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -567,6 +573,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -586,10 +593,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -627,6 +634,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -651,22 +662,23 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            prompt,
-            height,
-            width,
-            strength,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-            prompt_embeds,
-            negative_prompt_embeds,
+            prompt=prompt,
+            height=height,
+            width=width,
+            strength=strength,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            video=video,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
        )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -755,6 +767,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
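Both CogVideoX pipelines also gain the new attention_kwargs argument, which is forwarded to the transformer's attention processors; with the PEFT backend this is commonly used to scale LoRA layers at inference time. A hedged sketch, where the checkpoint id, input file, and scale value are illustrative assumptions:

import torch
from diffusers import CogVideoXVideoToVideoPipeline
from diffusers.utils import export_to_video, load_video

pipe = CogVideoXVideoToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",  # assumed checkpoint id, shown for illustration only
    torch_dtype=torch.bfloat16,
).to("cuda")

video = load_video("input.mp4")
frames = pipe(
    video=video,
    prompt="the same scene rendered as a watercolor painting",
    strength=0.8,
    attention_kwargs={"scale": 0.7},  # e.g. down-weight LoRA layers when a LoRA adapter is loaded
).frames[0]
export_to_video(frames, "output.mp4", fps=8)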
diffusers/pipelines/cogview3/__init__.py
@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["CogView3PlusPipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cogview3plus"] = ["CogView3PlusPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_cogview3plus import CogView3PlusPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
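The new cogview3 package follows the usual diffusers lazy-import pattern, so CogView3PlusPipeline becomes importable from the top-level namespace when torch and transformers are installed. A minimal sketch; the checkpoint id is an assumption for illustration and not taken from this diff:

import torch
from diffusers import CogView3PlusPipeline

pipe = CogView3PlusPipeline.from_pretrained(
    "THUDM/CogView3-Plus-3B",  # assumed checkpoint id, shown for illustration only
    torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe(prompt="a photograph of a red fox in a snowy forest").images[0]
image.save("fox.png")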