diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +97 -4
- diffusers/callbacks.py +56 -3
- diffusers/configuration_utils.py +13 -1
- diffusers/image_processor.py +282 -71
- diffusers/loaders/__init__.py +24 -3
- diffusers/loaders/ip_adapter.py +543 -16
- diffusers/loaders/lora_base.py +138 -125
- diffusers/loaders/lora_conversion_utils.py +647 -0
- diffusers/loaders/lora_pipeline.py +2216 -230
- diffusers/loaders/peft.py +380 -0
- diffusers/loaders/single_file_model.py +71 -4
- diffusers/loaders/single_file_utils.py +597 -10
- diffusers/loaders/textual_inversion.py +5 -3
- diffusers/loaders/transformer_flux.py +181 -0
- diffusers/loaders/transformer_sd3.py +89 -0
- diffusers/loaders/unet.py +56 -12
- diffusers/models/__init__.py +49 -12
- diffusers/models/activations.py +22 -9
- diffusers/models/adapter.py +53 -53
- diffusers/models/attention.py +98 -13
- diffusers/models/attention_flax.py +1 -1
- diffusers/models/attention_processor.py +2160 -346
- diffusers/models/autoencoders/__init__.py +5 -0
- diffusers/models/autoencoders/autoencoder_dc.py +620 -0
- diffusers/models/autoencoders/autoencoder_kl.py +73 -12
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
- diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
- diffusers/models/autoencoders/vae.py +18 -5
- diffusers/models/controlnet.py +47 -802
- diffusers/models/controlnet_flux.py +70 -0
- diffusers/models/controlnet_sd3.py +26 -376
- diffusers/models/controlnet_sparsectrl.py +46 -719
- diffusers/models/controlnets/__init__.py +23 -0
- diffusers/models/controlnets/controlnet.py +872 -0
- diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
- diffusers/models/controlnets/controlnet_flux.py +536 -0
- diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
- diffusers/models/controlnets/controlnet_sd3.py +489 -0
- diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
- diffusers/models/controlnets/controlnet_union.py +832 -0
- diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
- diffusers/models/controlnets/multicontrolnet.py +183 -0
- diffusers/models/embeddings.py +996 -92
- diffusers/models/embeddings_flax.py +23 -9
- diffusers/models/model_loading_utils.py +264 -14
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +334 -51
- diffusers/models/normalization.py +157 -13
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
- diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
- diffusers/models/transformers/dit_transformer_2d.py +1 -1
- diffusers/models/transformers/latte_transformer_3d.py +4 -4
- diffusers/models/transformers/pixart_transformer_2d.py +10 -2
- diffusers/models/transformers/sana_transformer.py +488 -0
- diffusers/models/transformers/stable_audio_transformer.py +1 -1
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +422 -0
- diffusers/models/transformers/transformer_cogview3plus.py +386 -0
- diffusers/models/transformers/transformer_flux.py +189 -51
- diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
- diffusers/models/transformers/transformer_ltx.py +469 -0
- diffusers/models/transformers/transformer_mochi.py +499 -0
- diffusers/models/transformers/transformer_sd3.py +112 -18
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +8 -1
- diffusers/models/unets/unet_2d_blocks.py +88 -21
- diffusers/models/unets/unet_2d_condition.py +9 -9
- diffusers/models/unets/unet_3d_blocks.py +9 -7
- diffusers/models/unets/unet_motion_model.py +46 -68
- diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
- diffusers/models/unets/unet_stable_cascade.py +2 -2
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +14 -6
- diffusers/pipelines/__init__.py +69 -6
- diffusers/pipelines/allegro/__init__.py +48 -0
- diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
- diffusers/pipelines/allegro/pipeline_output.py +23 -0
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
- diffusers/pipelines/auto_pipeline.py +88 -10
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/cogvideo/__init__.py +2 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
- diffusers/pipelines/cogview3/__init__.py +47 -0
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
- diffusers/pipelines/cogview3/pipeline_output.py +21 -0
- diffusers/pipelines/controlnet/__init__.py +86 -80
- diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
- diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
- diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
- diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
- diffusers/pipelines/flux/__init__.py +23 -1
- diffusers/pipelines/flux/modeling_flux.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +256 -48
- diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
- diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
- diffusers/pipelines/flux/pipeline_output.py +16 -0
- diffusers/pipelines/free_noise_utils.py +365 -5
- diffusers/pipelines/hunyuan_video/__init__.py +48 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
- diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
- diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
- diffusers/pipelines/kolors/text_encoder.py +2 -2
- diffusers/pipelines/kolors/tokenizer.py +4 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latte/pipeline_latte.py +2 -2
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
- diffusers/pipelines/ltx/__init__.py +50 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
- diffusers/pipelines/ltx/pipeline_output.py +20 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
- diffusers/pipelines/mochi/__init__.py +48 -0
- diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
- diffusers/pipelines/mochi/pipeline_output.py +20 -0
- diffusers/pipelines/pag/__init__.py +13 -0
- diffusers/pipelines/pag/pag_utils.py +8 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
- diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
- diffusers/pipelines/pia/pipeline_pia.py +2 -0
- diffusers/pipelines/pipeline_flax_utils.py +1 -1
- diffusers/pipelines/pipeline_loading_utils.py +250 -31
- diffusers/pipelines/pipeline_utils.py +158 -186
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
- diffusers/pipelines/sana/__init__.py +47 -0
- diffusers/pipelines/sana/pipeline_output.py +21 -0
- diffusers/pipelines/sana/pipeline_sana.py +884 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
- diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/quantizers/__init__.py +16 -0
- diffusers/quantizers/auto.py +139 -0
- diffusers/quantizers/base.py +233 -0
- diffusers/quantizers/bitsandbytes/__init__.py +2 -0
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
- diffusers/quantizers/bitsandbytes/utils.py +306 -0
- diffusers/quantizers/gguf/__init__.py +1 -0
- diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
- diffusers/quantizers/gguf/utils.py +456 -0
- diffusers/quantizers/quantization_config.py +669 -0
- diffusers/quantizers/torchao/__init__.py +15 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
- diffusers/schedulers/scheduling_ddim.py +4 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
- diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
- diffusers/schedulers/scheduling_ddpm.py +6 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
- diffusers/schedulers/scheduling_deis_multistep.py +102 -6
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
- diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
- diffusers/schedulers/scheduling_edm_euler.py +8 -6
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
- diffusers/schedulers/scheduling_euler_discrete.py +92 -7
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
- diffusers/schedulers/scheduling_heun_discrete.py +114 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
- diffusers/schedulers/scheduling_lcm.py +2 -6
- diffusers/schedulers/scheduling_lms_discrete.py +76 -1
- diffusers/schedulers/scheduling_repaint.py +1 -1
- diffusers/schedulers/scheduling_sasolver.py +102 -6
- diffusers/schedulers/scheduling_tcd.py +2 -6
- diffusers/schedulers/scheduling_unclip.py +4 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
- diffusers/training_utils.py +63 -19
- diffusers/utils/__init__.py +7 -1
- diffusers/utils/constants.py +1 -0
- diffusers/utils/dummy_pt_objects.py +240 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
- diffusers/utils/dynamic_modules_utils.py +3 -3
- diffusers/utils/hub_utils.py +44 -40
- diffusers/utils/import_utils.py +98 -8
- diffusers/utils/loading_utils.py +28 -4
- diffusers/utils/peft_utils.py +6 -3
- diffusers/utils/testing_utils.py +115 -1
- diffusers/utils/torch_utils.py +3 -0
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py (+108 -50)

@@ -15,7 +15,7 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import PIL
 import torch
@@ -23,6 +23,7 @@ from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
@@ -87,7 +88,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -152,7 +153,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXImageToVideoPipeline(DiffusionPipeline):
+class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using CogVideoX.
 
@@ -207,6 +208,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -348,6 +352,12 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         generator: Optional[torch.Generator] = None,
         latents: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         shape = (
             batch_size,
@@ -357,11 +367,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )
 
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
+        # For CogVideoX1.5, the latent should add 1 for padding (Not use)
+        if self.transformer.config.patch_size_t is not None:
+            shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]
 
         image = image.unsqueeze(2)  # [B, C, F, H, W]
 
@@ -373,7 +381,13 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
 
         image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-
+
+        if not self.vae.config.invert_scale_latents:
+            image_latents = self.vae_scaling_factor_image * image_latents
+        else:
+            # This is awkward but required because the CogVideoX team forgot to multiply the
+            # scaling factor during training :)
+            image_latents = 1 / self.vae_scaling_factor_image * image_latents
 
         padding_shape = (
             batch_size,
@@ -382,9 +396,15 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
            height // self.vae_scale_factor_spatial,
            width // self.vae_scale_factor_spatial,
         )
+
         latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
         image_latents = torch.cat([image_latents, latent_padding], dim=1)
 
+        # Select the first frame along the second dimension
+        if self.transformer.config.patch_size_t is not None:
+            first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
+            image_latents = torch.cat([first_frame, image_latents], dim=1)
+
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
@@ -397,7 +417,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -438,7 +458,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         width,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
-        video=None,
         latents=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
@@ -494,9 +513,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )
 
-        if video is not None and latents is not None:
-            raise ValueError("Only one of `video` or `latents` should be provided")
-
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
     def fuse_qkv_projections(self) -> None:
         r"""Enables fused QKV projections."""
@@ -522,21 +538,39 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
 
-        grid_crops_coords = get_resize_crop_region_for_grid(
-            (grid_height, grid_width), base_size_width, base_size_height
-        )
-        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
-            embed_dim=self.transformer.config.attention_head_dim,
-            crops_coords=grid_crops_coords,
-            grid_size=(grid_height, grid_width),
-            temporal_size=num_frames,
-        )
+        p = self.transformer.config.patch_size
+        p_t = self.transformer.config.patch_size_t
+
+        base_size_width = self.transformer.config.sample_width // p
+        base_size_height = self.transformer.config.sample_height // p
+
+        if p_t is None:
+            # CogVideoX 1.0
+            grid_crops_coords = get_resize_crop_region_for_grid(
+                (grid_height, grid_width), base_size_width, base_size_height
+            )
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=grid_crops_coords,
+                grid_size=(grid_height, grid_width),
+                temporal_size=num_frames,
+                device=device,
+            )
+        else:
+            # CogVideoX 1.5
+            base_num_frames = (num_frames + p_t - 1) // p_t
+
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=None,
+                grid_size=(grid_height, grid_width),
+                temporal_size=base_num_frames,
+                grid_type="slice",
+                max_size=(base_size_height, base_size_width),
+                device=device,
+            )
 
-        freqs_cos = freqs_cos.to(device=device)
-        freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin
 
     @property
@@ -547,6 +581,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -558,8 +596,8 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         image: PipelineImageInput,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int =
-        width: int =
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_frames: int = 49,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
@@ -573,6 +611,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -584,7 +623,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
 
         Args:
             image (`PipelineImageInput`):
-                The input
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
@@ -592,14 +631,14 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.
-                The height in pixels of the generated image. This is set to
-            width (`int`, *optional*, defaults to self.
-                The width in pixels of the generated image. This is set to
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -636,6 +675,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -657,30 +700,29 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
-        if num_frames > 49:
-            raise ValueError(
-                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
-            )
-
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.
-        width = width or self.transformer.config.
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            image,
-            prompt,
-            height,
-            width,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-
-
+            image=image,
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -717,6 +759,15 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         self._num_timesteps = len(timesteps)
 
         # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
+        patch_size_t = self.transformer.config.patch_size_t
+        additional_frames = 0
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            additional_frames = patch_size_t - latent_frames % patch_size_t
+            num_frames += additional_frames * self.vae_scale_factor_temporal
+
         image = self.video_processor.preprocess(image, height=height, width=width).to(
             device, dtype=prompt_embeds.dtype
         )
@@ -745,6 +796,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             else None
         )
 
+        # 8. Create ofs embeds if required
+        ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0)
+
         # 8. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
 
@@ -769,7 +823,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                     hidden_states=latent_model_input,
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
+                    ofs=ofs_emb,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
@@ -813,6 +869,8 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 progress_bar.update()
 
         if not output_type == "latent":
+            # Discard any padding frames that were added for CogVideoX 1.5
+            latents = latents[:, additional_frames:]
            video = self.decode_latents(latents)
            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
        else:
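
Taken together, the hunks above give CogVideoXImageToVideoPipeline the CogVideoXLoraLoaderMixin base class and a new attention_kwargs argument that is forwarded to the transformer. A minimal usage sketch follows; the checkpoint and LoRA repository names are placeholders rather than values taken from this diff, and the "scale" entry assumes the usual diffusers convention of reading a LoRA scale from the attention kwargs.

# Sketch only: the checkpoint and LoRA repository ids below are placeholders, and the
# "scale" entry assumes the usual diffusers convention of passing a LoRA scale through
# the attention kwargs.
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",  # placeholder checkpoint id
    torch_dtype=torch.bfloat16,
).to("cuda")

# New in 0.32.0: the pipeline inherits from CogVideoXLoraLoaderMixin, so LoRA weights can be attached.
pipe.load_lora_weights("some-user/cogvideox-lora", adapter_name="example")  # placeholder repo id

image = load_image("input.png")
video = pipe(
    image=image,
    prompt="a slow pan across a coastal town at sunset",
    num_frames=49,
    attention_kwargs={"scale": 0.8},  # forwarded to the transformer's attention processors
).frames[0]
export_to_video(video, "output.mp4", fps=8)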
diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py (+89 -50)

@@ -15,21 +15,19 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from PIL import Image
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import (
-    logging,
-    replace_example_docstring,
-)
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from .pipeline_output import CogVideoXPipelineOutput
@@ -96,7 +94,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -161,7 +159,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
+class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for video-to-video generation using CogVideoX.
 
@@ -206,12 +204,16 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         self.register_modules(
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
+
         self.vae_scale_factor_spatial = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -353,6 +355,12 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         latents: Optional[torch.Tensor] = None,
         timestep: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)
 
         shape = (
@@ -363,20 +371,8 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )
 
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         if latents is None:
             if isinstance(generator, list):
-                if len(generator) != batch_size:
-                    raise ValueError(
-                        f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                        f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-                    )
-
                 init_latents = [
                     retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
                 ]
@@ -384,7 +380,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
 
             init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-            init_latents = self.
+            init_latents = self.vae_scaling_factor_image * init_latents
 
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
             latents = self.scheduler.add_noise(init_latents, noise, timestep)
@@ -398,7 +394,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -516,21 +512,39 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
 
-        grid_crops_coords = get_resize_crop_region_for_grid(
-            (grid_height, grid_width), base_size_width, base_size_height
-        )
-        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
-            embed_dim=self.transformer.config.attention_head_dim,
-            crops_coords=grid_crops_coords,
-            grid_size=(grid_height, grid_width),
-            temporal_size=num_frames,
-        )
+        p = self.transformer.config.patch_size
+        p_t = self.transformer.config.patch_size_t
+
+        base_size_width = self.transformer.config.sample_width // p
+        base_size_height = self.transformer.config.sample_height // p
+
+        if p_t is None:
+            # CogVideoX 1.0
+            grid_crops_coords = get_resize_crop_region_for_grid(
+                (grid_height, grid_width), base_size_width, base_size_height
+            )
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=grid_crops_coords,
+                grid_size=(grid_height, grid_width),
+                temporal_size=num_frames,
+                device=device,
+            )
+        else:
+            # CogVideoX 1.5
+            base_num_frames = (num_frames + p_t - 1) // p_t
+
+            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+                embed_dim=self.transformer.config.attention_head_dim,
+                crops_coords=None,
+                grid_size=(grid_height, grid_width),
+                temporal_size=base_num_frames,
+                grid_type="slice",
+                max_size=(base_size_height, base_size_width),
+                device=device,
+            )
 
-        freqs_cos = freqs_cos.to(device=device)
-        freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin
 
     @property
@@ -541,6 +555,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -552,8 +570,8 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         video: List[Image.Image] = None,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int =
-        width: int =
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         strength: float = 0.8,
@@ -567,6 +585,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -586,10 +605,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.
-                The height in pixels of the generated image. This is set to
-            width (`int`, *optional*, defaults to self.
-                The width in pixels of the generated image. This is set to
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -627,6 +646,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -651,22 +674,27 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.
-        width = width or self.transformer.config.
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = len(video) if latents is None else latents.size(1)
+
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            prompt,
-            height,
-            width,
-            strength,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-
-
+            prompt=prompt,
+            height=height,
+            width=width,
+            strength=strength,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            video=video,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -705,6 +733,16 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         self._num_timesteps = len(timesteps)
 
         # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
+        patch_size_t = self.transformer.config.patch_size_t
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            raise ValueError(
+                f"The number of latent frames must be divisible by `{patch_size_t=}` but the given video "
+                f"contains {latent_frames=}, which is not divisible."
+            )
+
         if latents is None:
             video = self.video_processor.preprocess_video(video, height=height, width=width)
             video = video.to(device=device, dtype=prompt_embeds.dtype)
@@ -755,6 +793,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
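
The video-to-video changes above also introduce a hard requirement that the number of latent frames be divisible by the transformer's patch_size_t (CogVideoX 1.5). A small worked example of that arithmetic, using illustrative values for the two config fields:

# Illustrative values: in the pipeline both numbers come from the VAE and transformer configs.
vae_scale_factor_temporal = 4
patch_size_t = 2

for num_frames in (49, 53):
    latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
    ok = patch_size_t is None or latent_frames % patch_size_t == 0
    print(num_frames, latent_frames, "accepted" if ok else "rejected (ValueError)")
# 49 frames -> 13 latent frames -> rejected; 53 frames -> 14 latent frames -> accepted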
diffusers/pipelines/cogview3/__init__.py (+47 -0, new file)

@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["CogView3PlusPipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cogview3plus"] = ["CogView3PlusPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_cogview3plus import CogView3PlusPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
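
The new diffusers/pipelines/cogview3/__init__.py above follows the standard diffusers lazy-module pattern, so the heavy pipeline module is only imported on first attribute access. A minimal import sketch, assuming torch and transformers are installed (otherwise the dummy objects from ...utils are exposed instead):

# Minimal sketch: with torch and transformers available, the lazy module resolves the real classes
# on first access; without them, dummy placeholder objects are exposed instead.
from diffusers.pipelines.cogview3 import CogView3PlusPipeline, CogView3PlusPipelineOutput

print(CogView3PlusPipeline.__name__, CogView3PlusPipelineOutput.__name__)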