diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +20 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +23 -25
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +46 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
- diffusers/schedulers/scheduling_edm_euler.py +53 -30
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
- diffusers/schedulers/scheduling_euler_discrete.py +163 -67
- diffusers/schedulers/scheduling_heun_discrete.py +60 -38
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +27 -25
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
- diffusers-0.28.0.dist-info/RECORD +414 -0
- diffusers-0.27.1.dist-info/RECORD +0 -399
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
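Most of the hunks excerpted below apply one mechanical change across the codebase: public signatures and docstrings migrate from the legacy `torch.FloatTensor` annotation to plain `torch.Tensor`. A minimal sketch of the pattern (the function name and shapes here are hypothetical, not taken from the diff); since `torch.FloatTensor` values are ordinary `torch.Tensor` instances, the broader annotation is backward compatible for callers:

```python
from typing import Optional

import torch


# Hypothetical signature illustrating the 0.28.0 annotation style; 0.27.1
# would have annotated `latents` as Optional[torch.FloatTensor] instead.
def denoise_step(latents: Optional[torch.Tensor] = None) -> torch.Tensor:
    if latents is None:
        latents = torch.randn(1, 4, 64, 64)  # shape chosen for illustration only
    return latents


# Any float tensor already satisfies the new, broader annotation.
print(isinstance(torch.randn(2, 3), torch.Tensor))  # True
```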
diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py:

```diff
@@ -74,7 +74,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
         guidance_scale: Optional[float] = 1.0,
         eta: Optional[float] = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         **kwargs,
@@ -98,7 +98,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
@@ -465,17 +465,17 @@ class LDMBertEncoderLayer(nn.Module):

     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: torch.FloatTensor,
-        layer_head_mask: torch.FloatTensor,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
         output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         """
         Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
+            hidden_states (`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+            attention_mask (`torch.Tensor`): attention mask of size
                 `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+            layer_head_mask (`torch.Tensor`): mask for attention heads in a given layer of size
                 `(encoder_attention_heads,)`.
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@@ -587,7 +587,7 @@ class LDMBertEncoder(LDMBertPreTrainedModel):
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
@@ -615,7 +615,7 @@ class LDMBertEncoder(LDMBertPreTrainedModel):
                 - 1 indicates the head is **not masked**,
                 - 0 indicates the head is **masked**.

-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                 This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                 than the model's internal embedding lookup matrix.
```
diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py:

```diff
@@ -40,30 +40,21 @@ EXAMPLE_DOC_STRING = """
         >>> from io import BytesIO

         >>> from diffusers import LEditsPPPipelineStableDiffusion
+        >>> from diffusers.utils import load_image

         >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
         ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
         ... )
         >>> pipe = pipe.to("cuda")

-        >>> def download_image(url):
-        ...     response = requests.get(url)
-        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-
         >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
-        >>> image = download_image(img_url)
+        >>> image = load_image(img_url).convert("RGB")

-        >>> _ = pipe.invert(
-        ...     image = image,
-        ...     num_inversion_steps=50,
-        ...     skip=0.1
-        ... )
+        >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)

         >>> edited_image = pipe(
-        ...     editing_prompt=["cherry blossom"],
-        ...     edit_guidance_scale=10.0,
-        ...     edit_threshold=0.75,
-        ... ).images[0]
+        ...     editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75
+        ... ).images[0]
         ```
         """
```
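The rewritten example swaps the hand-rolled `requests`/`BytesIO` helper for `diffusers.utils.load_image`, which accepts a URL or local path and returns a `PIL.Image`. A standalone sketch of just that call, reusing the URL from the example:

```python
from diffusers.utils import load_image

# load_image fetches the URL (or opens a local path) and returns a PIL.Image,
# making the old download_image(url) helper unnecessary.
image = load_image(
    "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
).convert("RGB")
print(image.size)
```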
```diff
@@ -279,8 +270,8 @@ class LEditsPPPipelineStableDiffusion(
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
-            be set to [`DPMSolverMultistepScheduler`].
+            [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+            automatically be set to [`DPMSolverMultistepScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
@@ -511,8 +502,8 @@ class LEditsPPPipelineStableDiffusion(
         enable_edit_guidance,
         negative_prompt=None,
         editing_prompt=None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        editing_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        editing_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -531,12 +522,11 @@ class LEditsPPPipelineStableDiffusion(
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
             editing_prompt (`str` or `List[str]`, *optional*):
-                Editing prompt(s) to be encoded. If not defined, one has to pass
-                `editing_prompt_embeds` instead.
-            editing_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead.
+            editing_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -714,13 +704,13 @@ class LEditsPPPipelineStableDiffusion(
         return_dict: bool = True,
         editing_prompt: Optional[Union[str, List[str]]] = None,
         editing_prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         reverse_editing_direction: Optional[Union[bool, List[bool]]] = False,
         edit_guidance_scale: Optional[Union[float, List[float]]] = 5,
         edit_warmup_steps: Optional[Union[int, List[int]]] = 0,
         edit_cooldown_steps: Optional[Union[int, List[int]]] = None,
         edit_threshold: Optional[Union[float, List[float]]] = 0.9,
-        user_mask: Optional[torch.FloatTensor] = None,
+        user_mask: Optional[torch.Tensor] = None,
         sem_guidance: Optional[List[torch.Tensor]] = None,
         use_cross_attn_mask: bool = False,
         use_intersect_mask: bool = True,
@@ -734,8 +724,9 @@ class LEditsPPPipelineStableDiffusion(
         **kwargs,
     ):
         r"""
-        The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`]
-        method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+        The call function to the pipeline for editing. The
+        [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will
+        always be performed for the last inverted image(s).

         Args:
             negative_prompt (`str` or `List[str]`, *optional*):
```
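As the rewritten docstring notes, LEDITS++ editing is a two-step API: `invert` must be called first, and the pipeline call then edits the last inverted image(s). A condensed sketch of that flow, assembled from the example docstring earlier in this file:

```python
import torch
from diffusers import LEditsPPPipelineStableDiffusion
from diffusers.utils import load_image

pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

image = load_image(
    "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
).convert("RGB")

# Step 1: invert the input image; the pipeline caches the inversion state.
_ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)

# Step 2: the call itself; edits always apply to the last inverted image(s).
edited = pipe(
    editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75
).images[0]
```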
```diff
@@ -748,49 +739,51 @@ class LEditsPPPipelineStableDiffusion(
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a
-                plain tuple.
+                Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain
+                tuple.
             editing_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. The image is reconstructed by setting
-                `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+                `editing_prompt = None`. Guidance direction of prompt should be specified via
+                `reverse_editing_direction`.
             editing_prompt_embeds (`torch.Tensor>`, *optional*):
-                Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be
-                specified via `reverse_editing_direction`.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should
+                be specified via `reverse_editing_direction`.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
                 Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
             edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
-                Guidance scale for guiding the image generation. If provided as list values should correspond to
-                `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of
-                [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+                Guidance scale for guiding the image generation. If provided as list values should correspond to
+                `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
             edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
                 Number of diffusion steps (for each prompt) for which guidance will not be applied.
             edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
                 Number of diffusion steps (for each prompt) after which guidance will no longer be applied.
             edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
                 Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
-                'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
-            user_mask (`torch.FloatTensor`, *optional*):
-                User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
-                masks do not meet user preferences.
+                'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
+            user_mask (`torch.Tensor`, *optional*):
+                User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+                implicit masks do not meet user preferences.
             sem_guidance (`List[torch.Tensor]`, *optional*):
                 List of pre-generated guidance vectors to be applied at generation. Length of the list has to
                 correspond to `num_inference_steps`.
             use_cross_attn_mask (`bool`, defaults to `False`):
                 Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
-                is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
-                [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+                is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+                paper](https://arxiv.org/pdf/2311.16711.pdf).
             use_intersect_mask (`bool`, defaults to `True`):
-                Whether the masking term is calculated as intersection of cross-attention masks and masks derived
-                from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
-                estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+                Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+                the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+                are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
             attn_store_steps (`List[int]`, *optional*):
                 Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
             store_averaged_over_steps (`bool`, defaults to `True`):
-                Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
-                If False, attention maps for each step are stores separately. Just for visualization purposes.
+                Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+                False, attention maps for each step are stores separately. Just for visualization purposes.
             cross_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -815,10 +808,10 @@ class LEditsPPPipelineStableDiffusion(

         Returns:
             [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
-            otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the
-            second element is a list of `bool`s denoting whether the corresponding generated image likely represents
-            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images, and the second element is a list
+            of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+            content, according to the `safety_checker`.
         """

         if self.inversion_steps is None:
@@ -1219,11 +1212,11 @@ class LEditsPPPipelineStableDiffusion(
         crops_coords: Optional[Tuple[int, int, int, int]] = None,
     ):
         r"""
-        The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
-        If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
-        will be performed instead.
+        The function to the pipeline for image inversion as described by the [LEDITS++
+        Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+        inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.

-
+        Args:
             image (`PipelineImageInput`):
                 Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
                 ratio.
@@ -1238,8 +1231,8 @@ class LEditsPPPipelineStableDiffusion(
                 Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
                 will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
             generator (`torch.Generator`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                inversion deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+                deterministic.
             cross_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -1247,23 +1240,24 @@ class LEditsPPPipelineStableDiffusion(
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
             height (`int`, *optional*, defaults to `None`):
-                The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height.
+                The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
+                height.
             width (`int`, *optional*`, defaults to `None`):
                 The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
             resize_mode (`str`, *optional*, defaults to `default`):
-                The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit
-                within the specified width and height, and it may not maintaining the original aspect ratio.
-                If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
-                within the dimensions, filling empty with data from image.
-                If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
-                within the dimensions, cropping the excess.
-                Note that resize_mode `fill` and `crop` are only supported for PIL image input.
+                The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
+                the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
+                resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
+                center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
+                image to fit within the specified width and height, maintaining the aspect ratio, and then center the
+                image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
+                supported for PIL image input.
             crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
                 The crop coordinates for each image in the batch. If `None`, will not crop the image.

         Returns:
-            [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
-            Output will contain the resized input image(s) and respective VAE reconstruction(s).
+            [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+            and respective VAE reconstruction(s).
         """
         # Reset attn processor, we do not want to store attn maps during inversion
         self.unet.set_attn_processor(AttnProcessor())
```
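The `resize_mode` semantics spelled out above come from `VaeImageProcessor` in diffusers/image_processor.py (also changed in this release, +158 -45). A hedged sketch of the three modes, assuming `VaeImageProcessor.resize` accepts `height`, `width`, and `resize_mode` for PIL input as the docstring describes:

```python
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import load_image

proc = VaeImageProcessor()
img = load_image("https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png")

# "default" may distort the aspect ratio; "fill" keeps it and pads with image
# data; "crop" keeps it and trims the excess. "fill"/"crop" are PIL-only.
for mode in ("default", "fill", "crop"):
    resized = proc.resize(img, height=512, width=512, resize_mode=mode)
    print(mode, resized.size)
```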
diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py:

```diff
@@ -85,25 +85,23 @@ EXAMPLE_DOC_STRING = """
         ... )
         >>> pipe = pipe.to("cuda")

+
         >>> def download_image(url):
         ...     response = requests.get(url)
         ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")

+
         >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg"
         >>> image = download_image(img_url)

-        >>> _ = pipe.invert(
-        ...     image = image,
-        ...     num_inversion_steps=50,
-        ...     skip=0.2
-        ... )
+        >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2)

         >>> edited_image = pipe(
-        ...     editing_prompt=["tennis ball","tomato"],
-        ...     reverse_editing_direction=[True,False],
-        ...     edit_guidance_scale=[5.0,10.0],
-        ...     edit_threshold=[0.9,0.85],
-        ... ).images[0]
+        ...     editing_prompt=["tennis ball", "tomato"],
+        ...     reverse_editing_direction=[True, False],
+        ...     edit_guidance_scale=[5.0, 10.0],
+        ...     edit_threshold=[0.9, 0.85],
+        ... ).images[0]
         ```
         """
@@ -292,9 +290,9 @@ class LEditsPPPipelineStableDiffusionXL(
     """
     Pipeline for textual image editing using LEDits++ with Stable Diffusion XL.

-    This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass
-    documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
-    device, etc.).
+    This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the
+    superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a
+    particular device, etc.).

     In addition the pipeline inherits the following loading methods:
         - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`]
@@ -325,8 +323,8 @@ class LEditsPPPipelineStableDiffusionXL(
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
-            be set to [`DPMSolverMultistepScheduler`].
+            [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+            automatically be set to [`DPMSolverMultistepScheduler`].
         force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
             Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
             `stabilityai/stable-diffusion-xl-base-1-0`.
@@ -411,14 +409,14 @@ class LEditsPPPipelineStableDiffusionXL(
         num_images_per_prompt: int = 1,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
         enable_edit_guidance: bool = True,
         editing_prompt: Optional[str] = None,
-        editing_prompt_embeds: Optional[torch.FloatTensor] = None,
-        editing_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        editing_prompt_embeds: Optional[torch.Tensor] = None,
+        editing_pooled_prompt_embeds: Optional[torch.Tensor] = None,
     ) -> object:
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -434,11 +432,11 @@ class LEditsPPPipelineStableDiffusionXL(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -452,11 +450,11 @@ class LEditsPPPipelineStableDiffusionXL(
             editing_prompt (`str` or `List[str]`, *optional*):
                 Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass
                 `editing_prompt_embeds` instead.
-            editing_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input
-                argument.
-            editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            editing_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from
+                `editing_prompt` input argument.
+            editing_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt`
                 input argument.
```
```diff
@@ -713,20 +711,22 @@ class LEditsPPPipelineStableDiffusionXL(
             self.vae.decoder.mid_block.to(dtype)

     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

         Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
             embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
```
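For context on the re-annotated helper above: its body (outside the lines shown) computes a sinusoidal embedding of the guidance scale `w`, following the VDM reference linked in the docstring. A sketch reproducing that computation as a free function, under the assumption the body is unchanged in this release:

```python
import torch


def get_guidance_scale_embedding(
    w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    """Sinusoidal guidance-scale embedding with shape `(len(w), embedding_dim)`."""
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]  # (len(w), half_dim)
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad if embedding_dim is odd
        emb = torch.nn.functional.pad(emb, (0, 1))
    assert emb.shape == (w.shape[0], embedding_dim)
    return emb


print(get_guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=8).shape)
```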
```diff
@@ -804,8 +804,8 @@ class LEditsPPPipelineStableDiffusionXL(
         denoising_end: Optional[float] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
@@ -824,7 +824,7 @@ class LEditsPPPipelineStableDiffusionXL(
         sem_guidance: Optional[List[torch.Tensor]] = None,
         use_cross_attn_mask: bool = False,
         use_intersect_mask: bool = False,
-        user_mask: Optional[torch.FloatTensor] = None,
+        user_mask: Optional[torch.Tensor] = None,
         attn_store_steps: Optional[List[int]] = [],
         store_averaged_over_steps: bool = True,
         clip_skip: Optional[int] = None,
@@ -833,8 +833,9 @@ class LEditsPPPipelineStableDiffusionXL(
         **kwargs,
     ):
         r"""
-        The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`]
-        method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+        The call function to the pipeline for editing. The
+        [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits
+        will always be performed for the last inverted image(s).

         Args:
             denoising_end (`float`, *optional*):
@@ -850,11 +851,11 @@ class LEditsPPPipelineStableDiffusionXL(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -868,7 +869,7 @@ class LEditsPPPipelineStableDiffusionXL(
             of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -892,11 +893,11 @@ class LEditsPPPipelineStableDiffusionXL(
             section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
             editing_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. The image is reconstructed by setting
-                `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+                `editing_prompt = None`. Guidance direction of prompt should be specified via
+                `reverse_editing_direction`.
             editing_prompt_embeddings (`torch.Tensor`, *optional*):
-                Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
-                argument.
+                Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument.
             editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*):
                 Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
@@ -904,35 +905,36 @@ class LEditsPPPipelineStableDiffusionXL(
             reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
                 Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
             edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
-                Guidance scale for guiding the image generation. If provided as list values should correspond to
-                `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of
-                [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+                Guidance scale for guiding the image generation. If provided as list values should correspond to
+                `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
             edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
                 Number of diffusion steps (for each prompt) for which guidance is not applied.
             edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
                 Number of diffusion steps (for each prompt) after which guidance is no longer applied.
             edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
                 Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
-                'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+                'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+                Paper](https://arxiv.org/abs/2301.12247).
             sem_guidance (`List[torch.Tensor]`, *optional*):
                 List of pre-generated guidance vectors to be applied at generation. Length of the list has to
                 correspond to `num_inference_steps`.
             use_cross_attn_mask:
                 Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
-                is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
-                [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+                is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+                paper](https://arxiv.org/pdf/2311.16711.pdf).
             use_intersect_mask:
-                Whether the masking term is calculated as intersection of cross-attention masks and masks derived
-                from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
-                estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+                Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+                the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+                are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
             user_mask:
-                User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
-                masks do not meet user preferences.
+                User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+                implicit masks do not meet user preferences.
             attn_store_steps:
                 Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
             store_averaged_over_steps:
-                Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
-                If False, attention maps for each step are stores separately. Just for visualization purposes.
+                Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+                False, attention maps for each step are stores separately. Just for visualization purposes.
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
@@ -950,8 +952,8 @@ class LEditsPPPipelineStableDiffusionXL(

         Returns:
             [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
-            otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+            [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images.
         """
         if self.inversion_steps is None:
             raise ValueError(
@@ -1417,7 +1419,6 @@ class LEditsPPPipelineStableDiffusionXL(
         if needs_upcasting:
             image = image.float()
             self.upcast_vae()
-            image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

         x0 = self.vae.encode(image).latent_dist.mode()
         x0 = x0.to(dtype)
@@ -1444,11 +1445,11 @@ class LEditsPPPipelineStableDiffusionXL(
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
     ):
         r"""
-        The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
-        If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
-        will be performed instead.
+        The function to the pipeline for image inversion as described by the [LEDITS++
+        Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+        inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.

-
+        Args:
             image (`PipelineImageInput`):
                 Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
                 ratio.
@@ -1470,8 +1471,8 @@ class LEditsPPPipelineStableDiffusionXL(
                 Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
                 will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
             generator (`torch.Generator`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                inversion deterministic.
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+                deterministic.
             crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
                 `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
                 `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
@@ -1486,8 +1487,8 @@ class LEditsPPPipelineStableDiffusionXL(
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

         Returns:
-            [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
-            Output will contain the resized input image(s) and respective VAE reconstruction(s).
+            [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+            and respective VAE reconstruction(s).
         """

         # Reset attn processor, we do not want to store attn maps during inversion
```