diffusers 0.30.2__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +38 -2
- diffusers/configuration_utils.py +12 -0
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +257 -54
- diffusers/loaders/__init__.py +2 -0
- diffusers/loaders/ip_adapter.py +5 -1
- diffusers/loaders/lora_base.py +14 -7
- diffusers/loaders/lora_conversion_utils.py +332 -0
- diffusers/loaders/lora_pipeline.py +707 -41
- diffusers/loaders/peft.py +1 -0
- diffusers/loaders/single_file_utils.py +81 -4
- diffusers/loaders/textual_inversion.py +2 -0
- diffusers/loaders/unet.py +39 -8
- diffusers/models/__init__.py +4 -0
- diffusers/models/adapter.py +53 -53
- diffusers/models/attention.py +86 -10
- diffusers/models/attention_processor.py +169 -133
- diffusers/models/autoencoders/autoencoder_kl.py +71 -11
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +287 -85
- diffusers/models/controlnet_flux.py +536 -0
- diffusers/models/controlnet_sd3.py +7 -3
- diffusers/models/controlnet_sparsectrl.py +0 -1
- diffusers/models/embeddings.py +238 -61
- diffusers/models/embeddings_flax.py +23 -9
- diffusers/models/model_loading_utils.py +182 -14
- diffusers/models/modeling_utils.py +283 -46
- diffusers/models/normalization.py +79 -0
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +58 -36
- diffusers/models/transformers/pixart_transformer_2d.py +9 -1
- diffusers/models/transformers/transformer_cogview3plus.py +386 -0
- diffusers/models/transformers/transformer_flux.py +161 -44
- diffusers/models/transformers/transformer_sd3.py +7 -1
- diffusers/models/unets/unet_2d_condition.py +8 -8
- diffusers/models/unets/unet_motion_model.py +41 -63
- diffusers/models/upsampling.py +6 -6
- diffusers/pipelines/__init__.py +40 -7
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
- diffusers/pipelines/auto_pipeline.py +39 -8
- diffusers/pipelines/cogvideo/__init__.py +6 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +32 -34
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +837 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +825 -0
- diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
- diffusers/pipelines/cogview3/__init__.py +47 -0
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
- diffusers/pipelines/cogview3/pipeline_output.py +21 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
- diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +10 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -20
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
- diffusers/pipelines/free_noise_utils.py +365 -5
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
- diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
- diffusers/pipelines/kolors/tokenizer.py +4 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
- diffusers/pipelines/latte/pipeline_latte.py +2 -2
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
- diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
- diffusers/pipelines/pag/__init__.py +6 -0
- diffusers/pipelines/pag/pag_utils.py +8 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
- diffusers/pipelines/pia/pipeline_pia.py +2 -0
- diffusers/pipelines/pipeline_loading_utils.py +225 -27
- diffusers/pipelines/pipeline_utils.py +123 -180
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
- diffusers/quantizers/__init__.py +16 -0
- diffusers/quantizers/auto.py +126 -0
- diffusers/quantizers/base.py +233 -0
- diffusers/quantizers/bitsandbytes/__init__.py +2 -0
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
- diffusers/quantizers/bitsandbytes/utils.py +306 -0
- diffusers/quantizers/quantization_config.py +391 -0
- diffusers/schedulers/scheduling_ddim.py +4 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
- diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
- diffusers/schedulers/scheduling_ddpm.py +4 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
- diffusers/schedulers/scheduling_deis_multistep.py +78 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
- diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
- diffusers/schedulers/scheduling_edm_euler.py +8 -6
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
- diffusers/schedulers/scheduling_euler_discrete.py +92 -7
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
- diffusers/schedulers/scheduling_heun_discrete.py +114 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
- diffusers/schedulers/scheduling_lms_discrete.py +76 -1
- diffusers/schedulers/scheduling_sasolver.py +78 -1
- diffusers/schedulers/scheduling_unclip.py +4 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
- diffusers/training_utils.py +48 -18
- diffusers/utils/__init__.py +2 -1
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +195 -0
- diffusers/utils/hub_utils.py +16 -4
- diffusers/utils/import_utils.py +31 -8
- diffusers/utils/loading_utils.py +28 -4
- diffusers/utils/peft_utils.py +3 -3
- diffusers/utils/testing_utils.py +59 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/RECORD +173 -147
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/WHEEL +1 -1
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
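The new diffusers/quantizers package listed above (auto.py, base.py, bitsandbytes/, quantization_config.py) adds built-in bitsandbytes quantization for models loaded with from_pretrained. A minimal sketch of the intended usage follows, assuming the BitsAndBytesConfig options mirror the transformers-style API; the checkpoint ID and dtypes are illustrative only, and bitsandbytes must be installed separately:

    # Sketch: 4-bit NF4 quantization of a diffusion transformer via the new quantizers package.
    # BitsAndBytesConfig and the quantization_config argument ship with diffusers 0.31.0;
    # the model ID below is only an example.
    import torch
    from diffusers import BitsAndBytesConfig, SD3Transformer2DModel

    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    transformer = SD3Transformer2DModel.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers",  # illustrative checkpoint
        subfolder="transformer",
        quantization_config=nf4_config,
        torch_dtype=torch.bfloat16,
    )

The same quantization_config argument is expected to apply to other model classes that subclass ModelMixin. The diff below covers one of the changed files, diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py.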
diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
@@ -119,7 +119,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

@@ -246,7 +246,6 @@ class AnimateDiffVideoToVideoPipeline(
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
     def encode_prompt(
         self,
         prompt,
@@ -299,7 +298,7 @@ class AnimateDiffVideoToVideoPipeline(
         else:
             scale_lora_layers(self.text_encoder, lora_scale)

-        if prompt is not None and isinstance(prompt, str):
+        if prompt is not None and isinstance(prompt, (str, dict)):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
             batch_size = len(prompt)
@@ -582,8 +581,8 @@ class AnimateDiffVideoToVideoPipeline(
             raise ValueError(
                 "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
             )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt is not None and not isinstance(prompt, (str, list, dict)):
+            raise ValueError(f"`prompt` has to be of type `str`, `list` or `dict` but is {type(prompt)}")

         if negative_prompt is not None and negative_prompt_embeds is not None:
             raise ValueError(
@@ -628,23 +627,20 @@ class AnimateDiffVideoToVideoPipeline(

     def prepare_latents(
         self,
-        video,
-        height,
-        width,
-        num_channels_latents,
-        batch_size,
-        timestep,
-        dtype,
-        device,
-        generator,
-        latents=None,
+        video: Optional[torch.Tensor] = None,
+        height: int = 64,
+        width: int = 64,
+        num_channels_latents: int = 4,
+        batch_size: int = 1,
+        timestep: Optional[int] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
         decode_chunk_size: int = 16,
-    ):
-        if latents is None:
-            num_frames = video.shape[1]
-        else:
-            num_frames = latents.shape[2]
-
+        add_noise: bool = False,
+    ) -> torch.Tensor:
+        num_frames = video.shape[1] if latents is None else latents.shape[2]
         shape = (
             batch_size,
             num_channels_latents,
@@ -708,8 +704,13 @@ class AnimateDiffVideoToVideoPipeline(
             if shape != latents.shape:
                 # [B, C, F, H, W]
                 raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}")
+
             latents = latents.to(device, dtype=dtype)

+            if add_noise:
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+                latents = self.scheduler.add_noise(latents, noise, timestep)
+
         return latents

     @property
@@ -735,6 +736,10 @@ class AnimateDiffVideoToVideoPipeline(
     def num_timesteps(self):
         return self._num_timesteps

+    @property
+    def interrupt(self):
+        return self._interrupt
+
     @torch.no_grad()
     def __call__(
         self,
@@ -743,6 +748,7 @@ class AnimateDiffVideoToVideoPipeline(
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 50,
+        enforce_inference_steps: bool = False,
         timesteps: Optional[List[int]] = None,
         sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.5,
@@ -874,9 +880,10 @@ class AnimateDiffVideoToVideoPipeline(
         self._guidance_scale = guidance_scale
         self._clip_skip = clip_skip
         self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False

         # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
+        if prompt is not None and isinstance(prompt, (str, dict)):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
             batch_size = len(prompt)
@@ -884,51 +891,29 @@ class AnimateDiffVideoToVideoPipeline(
             batch_size = prompt_embeds.shape[0]

         device = self._execution_device
+        dtype = self.dtype

-        # 3. Encode input prompt
-        text_encoder_lora_scale = (
-            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
-        )
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_videos_per_prompt,
-            self.do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=text_encoder_lora_scale,
-            clip_skip=self.clip_skip,
-        )
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        if self.do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image,
-                ip_adapter_image_embeds,
-                device,
-                batch_size * num_videos_per_prompt,
-                self.do_classifier_free_guidance,
+        # 3. Prepare timesteps
+        if not enforce_inference_steps:
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler, num_inference_steps, device, timesteps, sigmas
             )
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+        else:
+            denoising_inference_steps = int(num_inference_steps / strength)
+            timesteps, denoising_inference_steps = retrieve_timesteps(
+                self.scheduler, denoising_inference_steps, device, timesteps, sigmas
+            )
+            timesteps = timesteps[-num_inference_steps:]
+            latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)

-        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler, num_inference_steps, device, timesteps, sigmas
-        )
-        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
-
-        # 5. Prepare latent variables
+        # 4. Prepare latent variables
         if latents is None:
             video = self.video_processor.preprocess_video(video, height=height, width=width)
             # Move the number of frames before the number of channels.
             video = video.permute(0, 2, 1, 3, 4)
-            video = video.to(device=device, dtype=prompt_embeds.dtype)
+            video = video.to(device=device, dtype=dtype)
         num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
             video=video,
@@ -937,17 +922,67 @@ class AnimateDiffVideoToVideoPipeline(
             num_channels_latents=num_channels_latents,
             batch_size=batch_size * num_videos_per_prompt,
             timestep=latent_timestep,
-            dtype=prompt_embeds.dtype,
+            dtype=dtype,
             device=device,
             generator=generator,
             latents=latents,
             decode_chunk_size=decode_chunk_size,
+            add_noise=enforce_inference_steps,
+        )
+
+        # 5. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
         )
+        num_frames = latents.shape[2]
+        if self.free_noise_enabled:
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
+                prompt=prompt,
+                num_frames=num_frames,
+                device=device,
+                num_videos_per_prompt=num_videos_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+        else:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+                clip_skip=self.clip_skip,
+            )
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
+
+        # 6. Prepare IP-Adapter embeddings
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+            )

-        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 7. Add image embeds for IP-Adapter
+        # 8. Add image embeds for IP-Adapter
         added_cond_kwargs = (
             {"image_embeds": image_embeds}
             if ip_adapter_image is not None or ip_adapter_image_embeds is not None
@@ -967,9 +1002,12 @@ class AnimateDiffVideoToVideoPipeline(
             self._num_timesteps = len(timesteps)
             num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

-            # 8. Denoising loop
+            # 9. Denoising loop
             with self.progress_bar(total=self._num_timesteps) as progress_bar:
                 for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+
                     # expand the latents if we are doing classifier free guidance
                     latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                     latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
@@ -1005,14 +1043,14 @@ class AnimateDiffVideoToVideoPipeline(
                     if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                         progress_bar.update()

-        # 9. Post-processing
+        # 10. Post-processing
         if output_type == "latent":
             video = latents
         else:
             video_tensor = self.decode_latents(latents, decode_chunk_size)
             video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

-        # 10. Offload all models
+        # 11. Offload all models
         self.maybe_free_model_hooks()

         if not return_dict:
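Taken together, the hunks above change AnimateDiffVideoToVideoPipeline in three user-visible ways: prompt may now be a str, list, or dict (the dict form maps frame indices to prompts and is routed through _encode_prompt_free_noise when FreeNoise is enabled), a new enforce_inference_steps flag noises the input latents and runs the full requested number of denoising steps instead of truncating the schedule by strength, and an interrupt property lets a callback stop the denoising loop early. A minimal usage sketch under those assumptions; the checkpoint IDs and the input clip are illustrative only:

    # Sketch: video-to-video with the 0.31.0 additions shown in the diff above.
    # The model IDs are examples; any SD 1.5 checkpoint plus a motion adapter should work.
    import torch
    from diffusers import AnimateDiffVideoToVideoPipeline, MotionAdapter
    from diffusers.utils import export_to_gif, load_video

    adapter = MotionAdapter.from_pretrained(
        "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
    )
    pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
        "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
    ).to("cuda")

    frames = load_video("input.mp4")  # list of PIL images (illustrative input clip)

    # FreeNoise must be enabled for the new dict-style prompt (frame index -> prompt).
    pipe.enable_free_noise()
    output = pipe(
        video=frames,
        prompt={0: "a panda surfing, sunny day", 32: "a panda surfing, golden sunset"},
        negative_prompt="low quality, worst quality",
        strength=0.6,
        num_inference_steps=25,
        enforce_inference_steps=True,  # new in 0.31.0: run all 25 steps, noising the input latents
    )
    export_to_gif(output.frames[0], "output.gif")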