diffusers 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +94 -3
- diffusers/commands/env.py +1 -5
- diffusers/configuration_utils.py +4 -9
- diffusers/dependency_versions_table.py +2 -2
- diffusers/image_processor.py +1 -2
- diffusers/loaders/__init__.py +17 -2
- diffusers/loaders/ip_adapter.py +10 -7
- diffusers/loaders/lora_base.py +752 -0
- diffusers/loaders/lora_pipeline.py +2222 -0
- diffusers/loaders/peft.py +213 -5
- diffusers/loaders/single_file.py +1 -12
- diffusers/loaders/single_file_model.py +31 -10
- diffusers/loaders/single_file_utils.py +262 -2
- diffusers/loaders/textual_inversion.py +1 -6
- diffusers/loaders/unet.py +23 -208
- diffusers/models/__init__.py +20 -0
- diffusers/models/activations.py +22 -0
- diffusers/models/attention.py +386 -7
- diffusers/models/attention_processor.py +1795 -629
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_kl.py +14 -3
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1035 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
- diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vq_model.py +4 -4
- diffusers/models/controlnet.py +2 -3
- diffusers/models/controlnet_hunyuan.py +401 -0
- diffusers/models/controlnet_sd3.py +11 -11
- diffusers/models/controlnet_sparsectrl.py +789 -0
- diffusers/models/controlnet_xs.py +40 -10
- diffusers/models/downsampling.py +68 -0
- diffusers/models/embeddings.py +319 -36
- diffusers/models/model_loading_utils.py +1 -3
- diffusers/models/modeling_flax_utils.py +1 -6
- diffusers/models/modeling_utils.py +4 -16
- diffusers/models/normalization.py +203 -12
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +527 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +345 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
- diffusers/models/transformers/latte_transformer_3d.py +327 -0
- diffusers/models/transformers/lumina_nextdit2d.py +340 -0
- diffusers/models/transformers/pixart_transformer_2d.py +102 -1
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/stable_audio_transformer.py +458 -0
- diffusers/models/transformers/transformer_flux.py +455 -0
- diffusers/models/transformers/transformer_sd3.py +18 -4
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_condition.py +8 -1
- diffusers/models/unets/unet_3d_blocks.py +51 -920
- diffusers/models/unets/unet_3d_condition.py +4 -1
- diffusers/models/unets/unet_i2vgen_xl.py +4 -1
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +1330 -84
- diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
- diffusers/models/unets/unet_stable_cascade.py +1 -3
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +64 -0
- diffusers/models/vq_model.py +8 -4
- diffusers/optimization.py +1 -1
- diffusers/pipelines/__init__.py +100 -3
- diffusers/pipelines/animatediff/__init__.py +4 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
- diffusers/pipelines/aura_flow/__init__.py +48 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
- diffusers/pipelines/auto_pipeline.py +97 -19
- diffusers/pipelines/cogvideo/__init__.py +48 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +687 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
- diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +749 -0
- diffusers/pipelines/flux/pipeline_output.py +21 -0
- diffusers/pipelines/free_init_utils.py +2 -0
- diffusers/pipelines/free_noise_utils.py +236 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
- diffusers/pipelines/kolors/__init__.py +54 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
- diffusers/pipelines/kolors/pipeline_output.py +21 -0
- diffusers/pipelines/kolors/text_encoder.py +889 -0
- diffusers/pipelines/kolors/tokenizer.py +334 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
- diffusers/pipelines/latte/__init__.py +48 -0
- diffusers/pipelines/latte/pipeline_latte.py +881 -0
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
- diffusers/pipelines/lumina/__init__.py +48 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
- diffusers/pipelines/pag/__init__.py +67 -0
- diffusers/pipelines/pag/pag_utils.py +237 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
- diffusers/pipelines/pia/pipeline_pia.py +30 -37
- diffusers/pipelines/pipeline_flax_utils.py +4 -9
- diffusers/pipelines/pipeline_loading_utils.py +0 -3
- diffusers/pipelines/pipeline_utils.py +2 -14
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
- diffusers/pipelines/stable_audio/__init__.py +50 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
- diffusers/schedulers/__init__.py +8 -0
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
- diffusers/schedulers/scheduling_ddim.py +1 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
- diffusers/schedulers/scheduling_ddpm.py +1 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
- diffusers/schedulers/scheduling_deis_multistep.py +2 -2
- diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
- diffusers/schedulers/scheduling_ipndm.py +1 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
- diffusers/schedulers/scheduling_utils.py +1 -3
- diffusers/schedulers/scheduling_utils_flax.py +1 -3
- diffusers/training_utils.py +99 -14
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +210 -0
- diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
- diffusers/utils/dynamic_modules_utils.py +1 -11
- diffusers/utils/export_utils.py +1 -4
- diffusers/utils/hub_utils.py +45 -42
- diffusers/utils/import_utils.py +19 -16
- diffusers/utils/loading_utils.py +76 -3
- diffusers/utils/testing_utils.py +11 -8
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/METADATA +73 -83
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/RECORD +217 -164
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/WHEEL +1 -1
- diffusers/loaders/autoencoder.py +0 -146
- diffusers/loaders/controlnet.py +0 -136
- diffusers/loaders/lora.py +0 -1728
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/LICENSE +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/top_level.txt +0 -0
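Beyond the line-level changes below, the file list shows several entirely new model and pipeline families landing in 0.30.0 (AuraFlow, CogVideoX, Flux, Kolors, Latte, Lumina, PAG, Stable Audio, SD3 inpainting, SparseCtrl). As a quick orientation, here is a minimal sketch of driving one of them, the new Flux text-to-image pipeline; the checkpoint id and generation settings are illustrative assumptions, not taken from this diff:

    import torch
    from diffusers import FluxPipeline

    # assumed public Flux checkpoint; any Flux-format repo id is loaded the same way
    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    pipe.enable_model_cpu_offload()  # optional, keeps peak VRAM down

    image = pipe(
        "a watercolor fox in a snowy forest",
        guidance_scale=0.0,          # the distilled schnell variant is typically run guidance-free
        num_inference_steps=4,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]
    image.save("flux_fox.png")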
@@ -36,8 +36,6 @@ from ...loaders import (
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -712,6 +710,13 @@ class StableDiffusionXLImg2ImgPipeline(
             )

         elif isinstance(generator, list):
+            if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+            elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                )
+
             init_latents = [
                 retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                 for i in range(batch_size)
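The added branch tiles a smaller image batch up to the requested batch size only when it divides evenly, and errors otherwise. A standalone sketch of that rule (tensor shapes here are illustrative, not taken from the diff):

    import torch

    batch_size = 4
    image = torch.randn(2, 3, 1024, 1024)  # 2 images, 4 outputs requested

    if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
        # 4 // 2 == 2 copies -> final shape (4, 3, 1024, 1024)
        image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
    elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
        # e.g. 3 input images with batch_size 4 cannot be tiled evenly
        raise ValueError(f"Cannot duplicate `image` of batch size {image.shape[0]} to {batch_size}")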
@@ -725,8 +730,8 @@ class StableDiffusionXLImg2ImgPipeline(

         init_latents = init_latents.to(dtype)
         if latents_mean is not None and latents_std is not None:
-            latents_mean = latents_mean.to(device=
-            latents_std = latents_std.to(device=
+            latents_mean = latents_mean.to(device=device, dtype=dtype)
+            latents_std = latents_std.to(device=device, dtype=dtype)
             init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
         else:
             init_latents = self.vae.config.scaling_factor * init_latents
@@ -781,6 +786,9 @@ class StableDiffusionXLImg2ImgPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -790,7 +798,6 @@ class StableDiffusionXLImg2ImgPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -798,36 +805,28 @@ class StableDiffusionXLImg2ImgPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-
-                    single_image_embeds = single_image_embeds.to(device)
-
-                    image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     def _get_add_time_ids(
         self,
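The rewritten `prepare_ip_adapter_image_embeds` above still expects user-supplied `ip_adapter_image_embeds` to carry the negative and positive embeddings concatenated along the batch dimension when classifier-free guidance is enabled; that is what the `chunk(2)` call unpacks. A minimal sketch of assembling such an input by hand for a single IP Adapter (the tensor shapes are made up for illustration):

    import torch

    do_classifier_free_guidance = True

    # hypothetical per-adapter embedding: (1, sequence_length, embed_dim)
    positive_embeds = torch.randn(1, 257, 1280)
    negative_embeds = torch.zeros_like(positive_embeds)

    if do_classifier_free_guidance:
        # negative first, positive second -- the order that chunk(2) assumes
        single_embeds = torch.cat([negative_embeds, positive_embeds], dim=0)
    else:
        single_embeds = positive_embeds

    ip_adapter_image_embeds = [single_embeds]  # one entry per IP Adapter
    # pipe(..., ip_adapter_image_embeds=ip_adapter_image_embeds)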
@@ -889,8 +888,6 @@ class StableDiffusionXLImg2ImgPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
@@ -37,8 +37,6 @@ from ...loaders import (
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -132,124 +130,6 @@ def mask_pil_to_torch(mask, height, width):
     return mask


-def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False):
-    """
-    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
-    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
-    ``image`` and ``1`` for the ``mask``.
-
-    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
-    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
-
-    Args:
-        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
-            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
-        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
-            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
-
-
-    Raises:
-        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
-        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
-        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
-            (ot the other way around).
-
-    Returns:
-        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
-        dimensions: ``batch x channels x height x width``.
-    """
-
-    # checkpoint. TOD(Yiyi) - need to clean this up later
-    deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead"
-    deprecate(
-        "prepare_mask_and_masked_image",
-        "0.30.0",
-        deprecation_message,
-    )
-    if image is None:
-        raise ValueError("`image` input cannot be undefined.")
-
-    if mask is None:
-        raise ValueError("`mask_image` input cannot be undefined.")
-
-    if isinstance(image, torch.Tensor):
-        if not isinstance(mask, torch.Tensor):
-            mask = mask_pil_to_torch(mask, height, width)
-
-        if image.ndim == 3:
-            image = image.unsqueeze(0)
-
-        # Batch and add channel dim for single mask
-        if mask.ndim == 2:
-            mask = mask.unsqueeze(0).unsqueeze(0)
-
-        # Batch single mask or add channel dim
-        if mask.ndim == 3:
-            # Single batched mask, no channel dim or single mask not batched but channel dim
-            if mask.shape[0] == 1:
-                mask = mask.unsqueeze(0)
-
-            # Batched masks no channel dim
-            else:
-                mask = mask.unsqueeze(1)
-
-        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
-        # assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
-        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
-
-        # Check image is in [-1, 1]
-        # if image.min() < -1 or image.max() > 1:
-        #     raise ValueError("Image should be in [-1, 1] range")
-
-        # Check mask is in [0, 1]
-        if mask.min() < 0 or mask.max() > 1:
-            raise ValueError("Mask should be in [0, 1] range")
-
-        # Binarize mask
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-
-        # Image as float32
-        image = image.to(dtype=torch.float32)
-    elif isinstance(mask, torch.Tensor):
-        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
-    else:
-        # preprocess image
-        if isinstance(image, (PIL.Image.Image, np.ndarray)):
-            image = [image]
-        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
-            # resize all images w.r.t passed height an width
-            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
-            image = [np.array(i.convert("RGB"))[None, :] for i in image]
-            image = np.concatenate(image, axis=0)
-        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
-            image = np.concatenate([i[None, :] for i in image], axis=0)
-
-        image = image.transpose(0, 3, 1, 2)
-        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-        mask = mask_pil_to_torch(mask, height, width)
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-
-        if image.shape[1] == 4:
-            # images are in latent space and thus can't
-            # be masked set masked_image to None
-            # we assume that the checkpoint is not an inpainting
-            # checkpoint. TOD(Yiyi) - need to clean this up later
-            masked_image = None
-        else:
-            masked_image = image * (mask < 0.5)
-
-        # n.b. ensure backwards compatibility as old function does not return image
-        if return_image:
-            return mask, masked_image, image
-
-        return mask, masked_image
-
-
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
     encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
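The long-deprecated `prepare_mask_and_masked_image` helper is removed in this release; its own deprecation message pointed at `VaeImageProcessor.preprocess` as the replacement. A rough sketch of that replacement path, with processor arguments chosen for illustration rather than copied from the diff:

    from diffusers.image_processor import VaeImageProcessor
    from PIL import Image

    image = Image.open("photo.png").convert("RGB")
    mask = Image.open("mask.png").convert("L")

    # image: resized and returned as a (1, 3, H, W) float tensor normalized to [-1, 1]
    image_processor = VaeImageProcessor(vae_scale_factor=8)
    image_tensor = image_processor.preprocess(image, height=1024, width=1024)

    # mask: kept in [0, 1], binarized, single channel -> (1, 1, H, W)
    mask_processor = VaeImageProcessor(
        vae_scale_factor=8, do_normalize=False, do_binarize=True, do_convert_grayscale=True
    )
    mask_tensor = mask_processor.preprocess(mask, height=1024, width=1024)

    masked_image = image_tensor * (mask_tensor < 0.5)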
@@ -474,6 +354,9 @@ class StableDiffusionXLInpaintPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -483,7 +366,6 @@ class StableDiffusionXLInpaintPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -491,36 +373,28 @@ class StableDiffusionXLInpaintPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-
-                    single_image_embeds = single_image_embeds.to(device)
-
-                    image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
     def encode_prompt(
@@ -1119,8 +993,6 @@ class StableDiffusionXLInpaintPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
@@ -25,8 +25,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
     FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -592,8 +590,6 @@ class StableDiffusionXLInstructPix2PixPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
                 FusedAttnProcessor2_0,
             ),
         )
@@ -19,10 +19,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import numpy as np
 import PIL.Image
 import torch
-from transformers import
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import VaeImageProcessor
-from ...loaders import
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -209,7 +209,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """

@@ -225,7 +225,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor:
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
@@ -340,7 +340,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self,
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

             # dynamically adjust the LoRA scale
@@ -473,7 +473,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self,
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -36,8 +36,6 @@ from ...loaders import (
 from ...models import AutoencoderKL, ImageProjection, MultiAdapter, T2IAdapter, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -239,7 +237,7 @@ class StableDiffusionXLAdapterPipeline(
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """

@@ -550,6 +548,9 @@ class StableDiffusionXLAdapterPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -559,7 +560,6 @@ class StableDiffusionXLAdapterPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -567,36 +567,28 @@ class StableDiffusionXLAdapterPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-
-                    single_image_embeds = single_image_embeds.to(device)
-
-                    image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
@@ -764,8 +756,6 @@ class StableDiffusionXLAdapterPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
@@ -18,7 +18,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import torch
 from transformers import CLIPTextModel, CLIPTokenizer

-from ...loaders import
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet3DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -58,7 +58,9 @@ EXAMPLE_DOC_STRING = """
 """


-class TextToVideoSDPipeline(
+class TextToVideoSDPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
     r"""
     Pipeline for text-to-video generation.

@@ -67,8 +69,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve

     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.
-        - [`~loaders.
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

     Args:
         vae ([`AutoencoderKL`]):
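Across these hunks the loader references now point at `StableDiffusionLoraLoaderMixin` (exported from `diffusers.loaders` alongside the new lora_base.py and lora_pipeline.py modules), while the user-facing entry points remain `load_lora_weights` and `save_lora_weights` on the pipeline. A hedged sketch against the text-to-video pipeline above; the checkpoint id and LoRA path are placeholders, not taken from this diff:

    import torch
    from diffusers import TextToVideoSDPipeline
    from diffusers.loaders import StableDiffusionLoraLoaderMixin

    # placeholder checkpoint id
    pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16)

    # the pipeline now advertises the renamed mixin, so the LoRA entry points resolve through it
    assert isinstance(pipe, StableDiffusionLoraLoaderMixin)
    pipe.load_lora_weights("path/to/lora")  # placeholder path to LoRA weights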
@@ -183,7 +185,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self,
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

             # dynamically adjust the LoRA scale
@@ -316,7 +318,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self,
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

|
|
19
19
|
import torch
|
20
20
|
from transformers import CLIPTextModel, CLIPTokenizer
|
21
21
|
|
22
|
-
from ...loaders import
|
22
|
+
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
|
23
23
|
from ...models import AutoencoderKL, UNet3DConditionModel
|
24
24
|
from ...models.lora import adjust_lora_scale_text_encoder
|
25
25
|
from ...schedulers import KarrasDiffusionSchedulers
|
@@ -93,7 +93,9 @@ def retrieve_latents(
|
|
93
93
|
raise AttributeError("Could not access latents of provided encoder_output")
|
94
94
|
|
95
95
|
|
96
|
-
class VideoToVideoSDPipeline(
|
96
|
+
class VideoToVideoSDPipeline(
|
97
|
+
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
|
98
|
+
):
|
97
99
|
r"""
|
98
100
|
Pipeline for text-guided video-to-video generation.
|
99
101
|
|
@@ -102,8 +104,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
|
|
102
104
|
|
103
105
|
The pipeline also inherits the following loading methods:
|
104
106
|
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
|
105
|
-
- [`~loaders.
|
106
|
-
- [`~loaders.
|
107
|
+
- [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
|
108
|
+
- [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
|
107
109
|
|
108
110
|
Args:
|
109
111
|
vae ([`AutoencoderKL`]):
|
@@ -218,7 +220,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
|
|
218
220
|
"""
|
219
221
|
# set lora scale so that monkey patched LoRA
|
220
222
|
# function of text encoder can correctly access it
|
221
|
-
if lora_scale is not None and isinstance(self,
|
223
|
+
if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
|
222
224
|
self._lora_scale = lora_scale
|
223
225
|
|
224
226
|
# dynamically adjust the LoRA scale
|
@@ -351,7 +353,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
|
|
351
353
|
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
352
354
|
|
353
355
|
if self.text_encoder is not None:
|
354
|
-
if isinstance(self,
|
356
|
+
if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
|
355
357
|
# Retrieve the original scale by scaling back the LoRA layers
|
356
358
|
unscale_lora_layers(self.text_encoder, lora_scale)
|
357
359
|
|
@@ -11,7 +11,7 @@ from torch.nn.functional import grid_sample
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import VaeImageProcessor
-from ...loaders import
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -281,7 +281,9 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s
     return warped_latents


-class TextToVideoZeroPipeline(
+class TextToVideoZeroPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
     r"""
     Pipeline for zero-shot text-to-video generation using Stable Diffusion.

@@ -831,7 +833,7 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self,
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

             # dynamically adjust the LoRA scale
@@ -964,7 +966,7 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self,
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -22,8 +22,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
     FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -438,8 +436,6 @@ class TextToVideoZeroSDXLPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
                 FusedAttnProcessor2_0,
             ),
         )
@@ -14,7 +14,7 @@ from transformers import (
 )

 from ...image_processor import VaeImageProcessor
-from ...loaders import
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -422,7 +422,7 @@ class UniDiffuserPipeline(DiffusionPipeline):
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self,
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

             # dynamically adjust the LoRA scale
@@ -555,7 +555,7 @@ class UniDiffuserPipeline(DiffusionPipeline):
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self,
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
