diffusers 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +94 -3
- diffusers/commands/env.py +1 -5
- diffusers/configuration_utils.py +4 -9
- diffusers/dependency_versions_table.py +2 -2
- diffusers/image_processor.py +1 -2
- diffusers/loaders/__init__.py +17 -2
- diffusers/loaders/ip_adapter.py +10 -7
- diffusers/loaders/lora_base.py +752 -0
- diffusers/loaders/lora_pipeline.py +2222 -0
- diffusers/loaders/peft.py +213 -5
- diffusers/loaders/single_file.py +1 -12
- diffusers/loaders/single_file_model.py +31 -10
- diffusers/loaders/single_file_utils.py +262 -2
- diffusers/loaders/textual_inversion.py +1 -6
- diffusers/loaders/unet.py +23 -208
- diffusers/models/__init__.py +20 -0
- diffusers/models/activations.py +22 -0
- diffusers/models/attention.py +386 -7
- diffusers/models/attention_processor.py +1795 -629
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_kl.py +14 -3
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1035 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
- diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vq_model.py +4 -4
- diffusers/models/controlnet.py +2 -3
- diffusers/models/controlnet_hunyuan.py +401 -0
- diffusers/models/controlnet_sd3.py +11 -11
- diffusers/models/controlnet_sparsectrl.py +789 -0
- diffusers/models/controlnet_xs.py +40 -10
- diffusers/models/downsampling.py +68 -0
- diffusers/models/embeddings.py +319 -36
- diffusers/models/model_loading_utils.py +1 -3
- diffusers/models/modeling_flax_utils.py +1 -6
- diffusers/models/modeling_utils.py +4 -16
- diffusers/models/normalization.py +203 -12
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +527 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +345 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
- diffusers/models/transformers/latte_transformer_3d.py +327 -0
- diffusers/models/transformers/lumina_nextdit2d.py +340 -0
- diffusers/models/transformers/pixart_transformer_2d.py +102 -1
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/stable_audio_transformer.py +458 -0
- diffusers/models/transformers/transformer_flux.py +455 -0
- diffusers/models/transformers/transformer_sd3.py +18 -4
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_condition.py +8 -1
- diffusers/models/unets/unet_3d_blocks.py +51 -920
- diffusers/models/unets/unet_3d_condition.py +4 -1
- diffusers/models/unets/unet_i2vgen_xl.py +4 -1
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +1330 -84
- diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
- diffusers/models/unets/unet_stable_cascade.py +1 -3
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +64 -0
- diffusers/models/vq_model.py +8 -4
- diffusers/optimization.py +1 -1
- diffusers/pipelines/__init__.py +100 -3
- diffusers/pipelines/animatediff/__init__.py +4 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
- diffusers/pipelines/aura_flow/__init__.py +48 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
- diffusers/pipelines/auto_pipeline.py +97 -19
- diffusers/pipelines/cogvideo/__init__.py +48 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +687 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
- diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +749 -0
- diffusers/pipelines/flux/pipeline_output.py +21 -0
- diffusers/pipelines/free_init_utils.py +2 -0
- diffusers/pipelines/free_noise_utils.py +236 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
- diffusers/pipelines/kolors/__init__.py +54 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
- diffusers/pipelines/kolors/pipeline_output.py +21 -0
- diffusers/pipelines/kolors/text_encoder.py +889 -0
- diffusers/pipelines/kolors/tokenizer.py +334 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
- diffusers/pipelines/latte/__init__.py +48 -0
- diffusers/pipelines/latte/pipeline_latte.py +881 -0
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
- diffusers/pipelines/lumina/__init__.py +48 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
- diffusers/pipelines/pag/__init__.py +67 -0
- diffusers/pipelines/pag/pag_utils.py +237 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
- diffusers/pipelines/pia/pipeline_pia.py +30 -37
- diffusers/pipelines/pipeline_flax_utils.py +4 -9
- diffusers/pipelines/pipeline_loading_utils.py +0 -3
- diffusers/pipelines/pipeline_utils.py +2 -14
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
- diffusers/pipelines/stable_audio/__init__.py +50 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
- diffusers/schedulers/__init__.py +8 -0
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
- diffusers/schedulers/scheduling_ddim.py +1 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
- diffusers/schedulers/scheduling_ddpm.py +1 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
- diffusers/schedulers/scheduling_deis_multistep.py +2 -2
- diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
- diffusers/schedulers/scheduling_ipndm.py +1 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
- diffusers/schedulers/scheduling_utils.py +1 -3
- diffusers/schedulers/scheduling_utils_flax.py +1 -3
- diffusers/training_utils.py +99 -14
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +210 -0
- diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
- diffusers/utils/dynamic_modules_utils.py +1 -11
- diffusers/utils/export_utils.py +1 -4
- diffusers/utils/hub_utils.py +45 -42
- diffusers/utils/import_utils.py +19 -16
- diffusers/utils/loading_utils.py +76 -3
- diffusers/utils/testing_utils.py +11 -8
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/METADATA +73 -83
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/RECORD +217 -164
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/WHEEL +1 -1
- diffusers/loaders/autoencoder.py +0 -146
- diffusers/loaders/controlnet.py +0 -136
- diffusers/loaders/lora.py +0 -1728
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/LICENSE +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py

@@ -19,7 +19,7 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
 from ...image_processor import PipelineImageInput
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...models.unets.unet_motion_model import MotionAdapter
@@ -35,6 +35,7 @@ from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..free_init_utils import FreeInitMixin
+from ..free_noise_utils import AnimateDiffFreeNoiseMixin
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import AnimateDiffPipelineOutput
 
@@ -174,8 +175,9 @@ class AnimateDiffVideoToVideoPipeline(
     StableDiffusionMixin,
     TextualInversionLoaderMixin,
     IPAdapterMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
     FreeInitMixin,
+    AnimateDiffFreeNoiseMixin,
 ):
     r"""
     Pipeline for video-to-video generation.
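The new AnimateDiffFreeNoiseMixin base comes from diffusers/pipelines/free_noise_utils.py (added above with +236 lines) and wires FreeNoise long-video support into this pipeline. A hedged sketch of turning it on; the context_length/context_stride parameters match the mixin's defaults in 0.30.0, but verify against free_noise_utils.py before relying on them:

import torch
from diffusers import AnimateDiffVideoToVideoPipeline, MotionAdapter

# Checkpoints below are the ones commonly used in the AnimateDiff docs, not taken from this diff.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
    "emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

# Provided by AnimateDiffFreeNoiseMixin; disable_free_noise() turns it back off.
pipe.enable_free_noise(context_length=16, context_stride=4)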
@@ -185,8 +187,8 @@ class AnimateDiffVideoToVideoPipeline(
 
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
 
     Args:
@@ -288,7 +290,7 @@ class AnimateDiffVideoToVideoPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -421,7 +423,7 @@ class AnimateDiffVideoToVideoPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
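The LoraLoaderMixin → StableDiffusionLoraLoaderMixin hunks in this file are mechanical renames following the split of the LoRA loaders into loaders/lora_base.py and loaders/lora_pipeline.py (see the file list above). End-user calls such as pipe.load_lora_weights(...) are unchanged; only code that imports or isinstance-checks the mixin directly needs updating. A small compatibility probe, assuming (not confirmed by this diff) that 0.30.0 keeps the old name as a deprecated alias:

# Probe whether the pre-0.30 class name still resolves after the rename.
from diffusers.loaders import StableDiffusionLoraLoaderMixin

try:
    from diffusers.loaders import LoraLoaderMixin  # 0.29.x name; assumed deprecated alias
    print("old name still importable; migrate isinstance checks when convenient")
except ImportError:
    print("old name removed; use StableDiffusionLoraLoaderMixin")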
@@ -456,6 +458,9 @@ class AnimateDiffVideoToVideoPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -465,7 +470,6 @@ class AnimateDiffVideoToVideoPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )
 
-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -473,46 +477,52 @@ class AnimateDiffVideoToVideoPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )
 
+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)
 
-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
 
-    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
-    def decode_latents(self, latents):
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
+    def encode_video(self, video, generator, decode_chunk_size: int = 16) -> torch.Tensor:
+        latents = []
+        for i in range(0, len(video), decode_chunk_size):
+            batch_video = video[i : i + decode_chunk_size]
+            batch_video = retrieve_latents(self.vae.encode(batch_video), generator=generator)
+            latents.append(batch_video)
+        return torch.cat(latents)
+
+    # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
         latents = 1 / self.vae.config.scaling_factor * latents
 
         batch_size, channels, num_frames, height, width = latents.shape
         latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
 
-        image = self.vae.decode(latents).sample
-        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+        video = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
+            batch_latents = self.vae.decode(batch_latents).sample
+            video.append(batch_latents)
+
+        video = torch.cat(video)
+        video = video[None, :].reshape((batch_size, num_frames, -1) + video.shape[2:]).permute(0, 2, 1, 3, 4)
         # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
         video = video.float()
         return video
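The new encode_video helper and the reworked decode_latents above apply the same chunking idea on both sides of the VAE: frames are pushed through in slices of decode_chunk_size, bounding peak memory at the cost of a loop. A self-contained sketch of the pattern with a stand-in decode function and made-up shapes:

import torch

def run_in_chunks(x: torch.Tensor, fn, chunk_size: int = 16) -> torch.Tensor:
    # Apply fn over slices of the leading (frame) dimension, then reassemble.
    outputs = [fn(x[i : i + chunk_size]) for i in range(0, x.shape[0], chunk_size)]
    return torch.cat(outputs)

# 24 fake frames of 4x8x8 latents; the lambda stands in for vae.decode(...).sample.
frames = run_in_chunks(torch.randn(24, 4, 8, 8), lambda t: t * 2.0, chunk_size=16)
assert frames.shape == (24, 4, 8, 8)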
@@ -628,6 +638,7 @@ class AnimateDiffVideoToVideoPipeline(
         device,
         generator,
         latents=None,
+        decode_chunk_size: int = 16,
     ):
         if latents is None:
             num_frames = video.shape[1]
@@ -662,13 +673,11 @@ class AnimateDiffVideoToVideoPipeline(
                 )
 
             init_latents = [
-                retrieve_latents(self.vae.encode(video[i]), generator=generator[i]).unsqueeze(0)
+                self.encode_video(video[i], generator[i], decode_chunk_size).unsqueeze(0)
                 for i in range(batch_size)
             ]
         else:
-            init_latents = [
-                retrieve_latents(self.vae.encode(vid), generator=generator).unsqueeze(0) for vid in video
-            ]
+            init_latents = [self.encode_video(vid, generator, decode_chunk_size).unsqueeze(0) for vid in video]
 
         init_latents = torch.cat(init_latents, dim=0)
 
@@ -753,6 +762,7 @@ class AnimateDiffVideoToVideoPipeline(
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        decode_chunk_size: int = 16,
     ):
         r"""
         The call function to the pipeline for generation.
@@ -828,6 +838,8 @@ class AnimateDiffVideoToVideoPipeline(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
+            decode_chunk_size (`int`, defaults to `16`):
+                The number of frames to decode at a time when calling `decode_latents` method.
 
         Examples:
 
@@ -929,6 +941,7 @@ class AnimateDiffVideoToVideoPipeline(
             device=device,
             generator=generator,
             latents=latents,
+            decode_chunk_size=decode_chunk_size,
         )
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -996,7 +1009,7 @@ class AnimateDiffVideoToVideoPipeline(
         if output_type == "latent":
             video = latents
         else:
-            video_tensor = self.decode_latents(latents)
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 10. Offload all models
diffusers/pipelines/audioldm2/modeling_audioldm2.py

@@ -544,7 +544,7 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad
 
         def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
             if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+                processors[f"{name}.processor"] = module.get_processor()
 
             for sub_name, child in module.named_children():
                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
diffusers/pipelines/audioldm2/pipeline_audioldm2.py

@@ -286,6 +286,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                 The sequence of generated hidden-states.
         """
         max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
+        model_kwargs = self.language_model._get_initial_cache_position(inputs_embeds, model_kwargs)
         for _ in range(max_new_tokens):
             # prepare model inputs
             model_inputs = prepare_inputs_for_generation(inputs_embeds, **model_kwargs)
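The added _get_initial_cache_position call appears to track newer transformers releases, which expect cache_position to be seeded in model_kwargs before a hand-rolled generation loop calls prepare_inputs_for_generation. Note that it relies on a private transformers helper, so the compatible transformers range is presumably reflected in the dependency_versions_table.py change listed above.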
diffusers/pipelines/aura_flow/__init__.py (new file)

@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_aura_flow"] = ["AuraFlowPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_aura_flow import AuraFlowPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)