diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. diffusers/__init__.py +97 -4
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +13 -1
  4. diffusers/image_processor.py +282 -71
  5. diffusers/loaders/__init__.py +24 -3
  6. diffusers/loaders/ip_adapter.py +543 -16
  7. diffusers/loaders/lora_base.py +138 -125
  8. diffusers/loaders/lora_conversion_utils.py +647 -0
  9. diffusers/loaders/lora_pipeline.py +2216 -230
  10. diffusers/loaders/peft.py +380 -0
  11. diffusers/loaders/single_file_model.py +71 -4
  12. diffusers/loaders/single_file_utils.py +597 -10
  13. diffusers/loaders/textual_inversion.py +5 -3
  14. diffusers/loaders/transformer_flux.py +181 -0
  15. diffusers/loaders/transformer_sd3.py +89 -0
  16. diffusers/loaders/unet.py +56 -12
  17. diffusers/models/__init__.py +49 -12
  18. diffusers/models/activations.py +22 -9
  19. diffusers/models/adapter.py +53 -53
  20. diffusers/models/attention.py +98 -13
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2160 -346
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +73 -12
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +70 -0
  36. diffusers/models/controlnet_sd3.py +26 -376
  37. diffusers/models/controlnet_sparsectrl.py +46 -719
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +996 -92
  49. diffusers/models/embeddings_flax.py +23 -9
  50. diffusers/models/model_loading_utils.py +264 -14
  51. diffusers/models/modeling_flax_utils.py +1 -1
  52. diffusers/models/modeling_utils.py +334 -51
  53. diffusers/models/normalization.py +157 -13
  54. diffusers/models/transformers/__init__.py +6 -0
  55. diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
  56. diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
  57. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  58. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  59. diffusers/models/transformers/pixart_transformer_2d.py +10 -2
  60. diffusers/models/transformers/sana_transformer.py +488 -0
  61. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  62. diffusers/models/transformers/transformer_2d.py +1 -1
  63. diffusers/models/transformers/transformer_allegro.py +422 -0
  64. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  65. diffusers/models/transformers/transformer_flux.py +189 -51
  66. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  67. diffusers/models/transformers/transformer_ltx.py +469 -0
  68. diffusers/models/transformers/transformer_mochi.py +499 -0
  69. diffusers/models/transformers/transformer_sd3.py +112 -18
  70. diffusers/models/transformers/transformer_temporal.py +1 -1
  71. diffusers/models/unets/unet_1d_blocks.py +1 -1
  72. diffusers/models/unets/unet_2d.py +8 -1
  73. diffusers/models/unets/unet_2d_blocks.py +88 -21
  74. diffusers/models/unets/unet_2d_condition.py +9 -9
  75. diffusers/models/unets/unet_3d_blocks.py +9 -7
  76. diffusers/models/unets/unet_motion_model.py +46 -68
  77. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  78. diffusers/models/unets/unet_stable_cascade.py +2 -2
  79. diffusers/models/unets/uvit_2d.py +1 -1
  80. diffusers/models/upsampling.py +14 -6
  81. diffusers/pipelines/__init__.py +69 -6
  82. diffusers/pipelines/allegro/__init__.py +48 -0
  83. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  84. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  85. diffusers/pipelines/animatediff/__init__.py +2 -0
  86. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  87. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
  88. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  89. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
  90. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
  91. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  92. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  93. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
  94. diffusers/pipelines/auto_pipeline.py +88 -10
  95. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  96. diffusers/pipelines/cogvideo/__init__.py +2 -0
  97. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
  98. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  99. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
  100. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
  101. diffusers/pipelines/cogview3/__init__.py +47 -0
  102. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  103. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  104. diffusers/pipelines/controlnet/__init__.py +86 -80
  105. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  106. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
  107. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
  108. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
  109. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
  110. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
  111. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
  112. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  113. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  114. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  115. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
  116. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  117. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
  118. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  119. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  120. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  121. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  122. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  123. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
  124. diffusers/pipelines/flux/__init__.py +23 -1
  125. diffusers/pipelines/flux/modeling_flux.py +47 -0
  126. diffusers/pipelines/flux/pipeline_flux.py +256 -48
  127. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  128. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  129. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  130. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  131. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  132. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  133. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  134. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  135. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  136. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  137. diffusers/pipelines/flux/pipeline_output.py +16 -0
  138. diffusers/pipelines/free_noise_utils.py +365 -5
  139. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  140. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  141. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  142. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
  143. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  145. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  146. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  147. diffusers/pipelines/kolors/text_encoder.py +2 -2
  148. diffusers/pipelines/kolors/tokenizer.py +4 -0
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  155. diffusers/pipelines/ltx/__init__.py +50 -0
  156. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  157. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  158. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  159. diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
  160. diffusers/pipelines/mochi/__init__.py +48 -0
  161. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  162. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  163. diffusers/pipelines/pag/__init__.py +13 -0
  164. diffusers/pipelines/pag/pag_utils.py +8 -2
  165. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
  166. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  167. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
  168. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  169. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
  170. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  171. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
  172. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  173. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  174. diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
  175. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  176. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  177. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  178. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  179. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  180. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  181. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  182. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  183. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  184. diffusers/pipelines/pipeline_loading_utils.py +250 -31
  185. diffusers/pipelines/pipeline_utils.py +158 -186
  186. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
  187. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
  188. diffusers/pipelines/sana/__init__.py +47 -0
  189. diffusers/pipelines/sana/pipeline_output.py +21 -0
  190. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  191. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  192. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  193. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  194. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
  195. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  196. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  197. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  198. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
  199. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
  200. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
  201. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  202. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  203. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  204. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  205. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
  206. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
  208. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  209. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  210. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  211. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  212. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  213. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  214. diffusers/quantizers/__init__.py +16 -0
  215. diffusers/quantizers/auto.py +139 -0
  216. diffusers/quantizers/base.py +233 -0
  217. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  218. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  219. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  220. diffusers/quantizers/gguf/__init__.py +1 -0
  221. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  222. diffusers/quantizers/gguf/utils.py +456 -0
  223. diffusers/quantizers/quantization_config.py +669 -0
  224. diffusers/quantizers/torchao/__init__.py +15 -0
  225. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  226. diffusers/schedulers/scheduling_ddim.py +4 -1
  227. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  228. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  229. diffusers/schedulers/scheduling_ddpm.py +6 -7
  230. diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
  231. diffusers/schedulers/scheduling_deis_multistep.py +102 -6
  232. diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
  236. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  237. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  238. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  239. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  240. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  241. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  244. diffusers/schedulers/scheduling_lcm.py +2 -6
  245. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  246. diffusers/schedulers/scheduling_repaint.py +1 -1
  247. diffusers/schedulers/scheduling_sasolver.py +102 -6
  248. diffusers/schedulers/scheduling_tcd.py +2 -6
  249. diffusers/schedulers/scheduling_unclip.py +4 -1
  250. diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
  251. diffusers/training_utils.py +63 -19
  252. diffusers/utils/__init__.py +7 -1
  253. diffusers/utils/constants.py +1 -0
  254. diffusers/utils/dummy_pt_objects.py +240 -0
  255. diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
  256. diffusers/utils/dynamic_modules_utils.py +3 -3
  257. diffusers/utils/hub_utils.py +44 -40
  258. diffusers/utils/import_utils.py +98 -8
  259. diffusers/utils/loading_utils.py +28 -4
  260. diffusers/utils/peft_utils.py +6 -3
  261. diffusers/utils/testing_utils.py +115 -1
  262. diffusers/utils/torch_utils.py +3 -0
  263. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
  264. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
  265. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  266. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  267. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  268. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
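
The largest additions in this release are the new diffusers/quantizers package (bitsandbytes, GGUF, and torchao backends) and a batch of new pipelines (Allegro, CogView3, HunyuanVideo, LTX, Mochi, Sana, Flux variants). A minimal sketch of how the new quantization configs plug into model loading is shown below; the checkpoint id and argument values are illustrative, and the keyword names assume the bitsandbytes-style config mirrored from transformers, so treat the exact API as an assumption rather than a verbatim reference.

# Hedged sketch: 4-bit loading of a transformer backbone via the new
# diffusers.quantizers bitsandbytes backend (diffusers >= 0.32.0).
# Checkpoint id and dtype choices are illustrative only.
import torch
from diffusers import BitsAndBytesConfig, FluxTransformer2DModel

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # assumed checkpoint id
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
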
diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py

@@ -15,7 +15,7 @@

 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import PIL
 import torch
@@ -23,6 +23,7 @@ from transformers import T5EncoderModel, T5Tokenizer

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
@@ -87,7 +88,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

@@ -152,7 +153,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")


-class CogVideoXImageToVideoPipeline(DiffusionPipeline):
+class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using CogVideoX.

@@ -207,6 +208,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )

         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

@@ -348,6 +352,12 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         generator: Optional[torch.Generator] = None,
         latents: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         shape = (
             batch_size,
@@ -357,11 +367,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )

-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
+        # For CogVideoX1.5, the latent should add 1 for padding (Not use)
+        if self.transformer.config.patch_size_t is not None:
+            shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]

         image = image.unsqueeze(2)  # [B, C, F, H, W]

@@ -373,7 +381,13 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]

         image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-        image_latents = self.vae.config.scaling_factor * image_latents
+
+        if not self.vae.config.invert_scale_latents:
+            image_latents = self.vae_scaling_factor_image * image_latents
+        else:
+            # This is awkward but required because the CogVideoX team forgot to multiply the
+            # scaling factor during training :)
+            image_latents = 1 / self.vae_scaling_factor_image * image_latents

         padding_shape = (
             batch_size,
@@ -382,9 +396,15 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
+
         latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
         image_latents = torch.cat([image_latents, latent_padding], dim=1)

+        # Select the first frame along the second dimension
+        if self.transformer.config.patch_size_t is not None:
+            first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
+            image_latents = torch.cat([first_frame, image_latents], dim=1)
+
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
@@ -397,7 +417,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents

         frames = self.vae.decode(latents).sample
         return frames
@@ -438,7 +458,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         width,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
-        video=None,
         latents=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
@@ -494,9 +513,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )

-        if video is not None and latents is not None:
-            raise ValueError("Only one of `video` or `latents` should be provided")
-
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
     def fuse_qkv_projections(self) -> None:
         r"""Enables fused QKV projections."""
@@ -522,21 +538,39 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)

-        grid_crops_coords = get_resize_crop_region_for_grid(
-            (grid_height, grid_width), base_size_width, base_size_height
-        )
-        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
-            embed_dim=self.transformer.config.attention_head_dim,
-            crops_coords=grid_crops_coords,
-            grid_size=(grid_height, grid_width),
-            temporal_size=num_frames,
-        )
+        p = self.transformer.config.patch_size
+        p_t = self.transformer.config.patch_size_t
+
+        base_size_width = self.transformer.config.sample_width // p
+        base_size_height = self.transformer.config.sample_height // p
+
+        if p_t is None:
+            # CogVideoX 1.0
+            grid_crops_coords = get_resize_crop_region_for_grid(
+                (grid_height, grid_width), base_size_width, base_size_height
+            )
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=grid_crops_coords,
+                grid_size=(grid_height, grid_width),
+                temporal_size=num_frames,
+                device=device,
+            )
+        else:
+            # CogVideoX 1.5
+            base_num_frames = (num_frames + p_t - 1) // p_t
+
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=None,
+                grid_size=(grid_height, grid_width),
+                temporal_size=base_num_frames,
+                grid_type="slice",
+                max_size=(base_size_height, base_size_width),
+                device=device,
+            )

-        freqs_cos = freqs_cos.to(device=device)
-        freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin

     @property
@@ -547,6 +581,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps

+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -558,8 +596,8 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         image: PipelineImageInput,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 480,
-        width: int = 720,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_frames: int = 49,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
@@ -573,6 +611,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -584,7 +623,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):

         Args:
             image (`PipelineImageInput`):
-                The input video to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
@@ -592,14 +631,14 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -636,6 +675,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -657,30 +700,29 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """

-        if num_frames > 49:
-            raise ValueError(
-                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
-            )
-
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            image,
-            prompt,
-            height,
-            width,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-            prompt_embeds,
-            negative_prompt_embeds,
+            image=image,
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False

         # 2. Default call parameters
@@ -717,6 +759,15 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         self._num_timesteps = len(timesteps)

         # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
+        patch_size_t = self.transformer.config.patch_size_t
+        additional_frames = 0
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            additional_frames = patch_size_t - latent_frames % patch_size_t
+            num_frames += additional_frames * self.vae_scale_factor_temporal
+
         image = self.video_processor.preprocess(image, height=height, width=width).to(
             device, dtype=prompt_embeds.dtype
         )
@@ -745,6 +796,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             else None
         )

+        # 8. Create ofs embeds if required
+        ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0)
+
         # 8. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

@@ -769,7 +823,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                     hidden_states=latent_model_input,
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
+                    ofs=ofs_emb,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
@@ -813,6 +869,8 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 progress_bar.update()

         if not output_type == "latent":
+            # Discard any padding frames that were added for CogVideoX 1.5
+            latents = latents[:, additional_frames:]
             video = self.decode_latents(latents)
             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
         else:
diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py

@@ -15,21 +15,19 @@

 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import torch
 from PIL import Image
 from transformers import T5EncoderModel, T5Tokenizer

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import (
-    logging,
-    replace_example_docstring,
-)
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from .pipeline_output import CogVideoXPipelineOutput
@@ -96,7 +94,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

@@ -161,7 +159,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")


-class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
+class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for video-to-video generation using CogVideoX.

@@ -206,12 +204,16 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         self.register_modules(
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
+
         self.vae_scale_factor_spatial = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )

         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

@@ -353,6 +355,12 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         latents: Optional[torch.Tensor] = None,
         timestep: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)

         shape = (
@@ -363,20 +371,8 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )

-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         if latents is None:
             if isinstance(generator, list):
-                if len(generator) != batch_size:
-                    raise ValueError(
-                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-                    )
-
                 init_latents = [
                     retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
                 ]
@@ -384,7 +380,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]

             init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-            init_latents = self.vae.config.scaling_factor * init_latents
+            init_latents = self.vae_scaling_factor_image * init_latents

             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
             latents = self.scheduler.add_noise(init_latents, noise, timestep)
@@ -398,7 +394,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents

         frames = self.vae.decode(latents).sample
         return frames
@@ -516,21 +512,39 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)

-        grid_crops_coords = get_resize_crop_region_for_grid(
-            (grid_height, grid_width), base_size_width, base_size_height
-        )
-        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
-            embed_dim=self.transformer.config.attention_head_dim,
-            crops_coords=grid_crops_coords,
-            grid_size=(grid_height, grid_width),
-            temporal_size=num_frames,
-        )
+        p = self.transformer.config.patch_size
+        p_t = self.transformer.config.patch_size_t
+
+        base_size_width = self.transformer.config.sample_width // p
+        base_size_height = self.transformer.config.sample_height // p
+
+        if p_t is None:
+            # CogVideoX 1.0
+            grid_crops_coords = get_resize_crop_region_for_grid(
+                (grid_height, grid_width), base_size_width, base_size_height
+            )
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=grid_crops_coords,
+                grid_size=(grid_height, grid_width),
+                temporal_size=num_frames,
+                device=device,
+            )
+        else:
+            # CogVideoX 1.5
+            base_num_frames = (num_frames + p_t - 1) // p_t
+
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=None,
+                grid_size=(grid_height, grid_width),
+                temporal_size=base_num_frames,
+                grid_type="slice",
+                max_size=(base_size_height, base_size_width),
+                device=device,
+            )

-        freqs_cos = freqs_cos.to(device=device)
-        freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin

     @property
@@ -541,6 +555,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps

+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -552,8 +570,8 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         video: List[Image.Image] = None,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 480,
-        width: int = 720,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         strength: float = 0.8,
@@ -567,6 +585,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -586,10 +605,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -627,6 +646,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -651,22 +674,27 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = len(video) if latents is None else latents.size(1)
+
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            prompt,
-            height,
-            width,
-            strength,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-            prompt_embeds,
-            negative_prompt_embeds,
+            prompt=prompt,
+            height=height,
+            width=width,
+            strength=strength,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            video=video,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False

         # 2. Default call parameters
@@ -705,6 +733,16 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         self._num_timesteps = len(timesteps)

         # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
+        patch_size_t = self.transformer.config.patch_size_t
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            raise ValueError(
+                f"The number of latent frames must be divisible by `{patch_size_t=}` but the given video "
+                f"contains {latent_frames=}, which is not divisible."
+            )
+
         if latents is None:
             video = self.video_processor.preprocess_video(video, height=height, width=width)
             video = video.to(device=device, dtype=prompt_embeds.dtype)
@@ -755,6 +793,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
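
Both CogVideoX pipelines now derive the latent frame count as (num_frames - 1) // vae_scale_factor_temporal + 1 and then reconcile it with the transformer's temporal patch size: the image-to-video pipeline pads num_frames up so the latent count becomes divisible by patch_size_t (and drops the padding frames again before decoding), while the video-to-video pipeline rejects non-divisible inputs. The standalone helper below only mirrors that arithmetic for illustration; the function name is invented and not part of diffusers.

# Illustrative helper (not part of diffusers): reproduces the frame-padding
# arithmetic added for CogVideoX 1.5 in the image-to-video pipeline.
def pad_frames_for_patch_size_t(num_frames, vae_scale_factor_temporal, patch_size_t):
    latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
    additional_frames = 0
    if patch_size_t is not None and latent_frames % patch_size_t != 0:
        additional_frames = patch_size_t - latent_frames % patch_size_t
        num_frames += additional_frames * vae_scale_factor_temporal
    return num_frames, additional_frames

# 49 input frames with a temporal compression of 4 give 13 latent frames;
# with patch_size_t = 2 one extra latent frame (4 pixel frames) is padded on.
assert pad_frames_for_patch_size_t(49, 4, None) == (49, 0)
assert pad_frames_for_patch_size_t(49, 4, 2) == (53, 1)
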
diffusers/pipelines/cogview3/__init__.py (new file)

@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["CogView3PlusPipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cogview3plus"] = ["CogView3PlusPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_cogview3plus import CogView3PlusPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
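
The new cogview3/__init__.py follows the standard diffusers lazy-import pattern: at import time the module is replaced by a _LazyModule that only materializes CogView3PlusPipeline when it is first accessed (or eagerly under type checking / DIFFUSERS_SLOW_IMPORT), and falls back to dummy objects when torch or transformers is missing. A hedged usage sketch, with the checkpoint id as an assumption:

# Hedged sketch: both import paths below resolve to the same class once torch
# and transformers are installed; the checkpoint id is illustrative only.
import torch
from diffusers import CogView3PlusPipeline                       # via diffusers' top-level lazy imports
# from diffusers.pipelines.cogview3 import CogView3PlusPipeline  # via this new __init__.py

pipe = CogView3PlusPipeline.from_pretrained(
    "THUDM/CogView3-Plus-3B", torch_dtype=torch.bfloat16  # assumed checkpoint id
).to("cuda")
image = pipe(prompt="a watercolor painting of a lighthouse").images[0]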