diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +121 -86
- diffusers/loaders/lora_conversion_utils.py +504 -44
- diffusers/loaders/lora_pipeline.py +1769 -181
- diffusers/loaders/peft.py +167 -57
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +646 -72
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +20 -7
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +593 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +9 -1
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +2 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
- diffusers-0.33.0.dist-info/RECORD +608 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
- diffusers-0.32.2.dist-info/RECORD +0 -550
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py:

```diff
@@ -28,6 +28,7 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
```
````diff
@@ -38,8 +39,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffu
 from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
````
```diff
@@ -155,7 +164,7 @@ class StableUnCLIPImg2ImgPipeline(
             vae=vae,
         )
 
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
```
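The `getattr(self, "vae", None)` guard recurs throughout this release. Unlike a plain `self.vae is not None` check, it also tolerates the attribute being absent entirely. A minimal sketch of the fallback, using a hypothetical `PipelineStub` in place of a pipeline constructed without a VAE:

```python
# Minimal sketch: getattr(obj, "vae", None) is falsy when the VAE is missing
# *or* None, so the scale factor falls back to 8 (the standard 8x KL VAE).
class PipelineStub:
    pass  # hypothetical stand-in; no `vae` attribute registered at all

pipe = PipelineStub()
vae = getattr(pipe, "vae", None)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) if vae else 8
print(vae_scale_factor)  # 8
```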
```diff
@@ -829,6 +838,9 @@ class StableUnCLIPImg2ImgPipeline(
                     step_idx = i // getattr(self.scheduler, "order", 1)
                     callback(step_idx, t, latents)
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # 9. Post-processing
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
```
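For context, the `is_torch_xla_available()` / `xm.mark_step()` pattern added above is how diffusers pipelines flush the lazily built XLA graph once per denoising step on TPU. A sketch of the same gating outside a pipeline (assumes `torch_xla` may or may not be installed):

```python
# Sketch of the XLA gating pattern used above: mark_step() tells torch_xla to
# cut and execute the pending lazy graph; without XLA the branch is skipped.
from diffusers.utils import is_torch_xla_available

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

for step in range(4):  # stand-in for the denoising loop
    ...  # model forward and scheduler step would run here
    if XLA_AVAILABLE:
        xm.mark_step()  # materialize this step's graph before the next one
```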
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py:

```diff
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from transformers import (
-    BaseImageProcessor,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
-    PreTrainedModel,
+    SiglipImageProcessor,
+    SiglipVisionModel,
     T5EncoderModel,
     T5TokenizerFast,
 )
```
```diff
@@ -76,7 +76,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
```
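The `max_shift` default change feeds the linear interpolation computed in the body: a line through `(base_seq_len, base_shift)` and `(max_seq_len, max_shift)`. A worked sketch of the full function (the `return` line is inferred from the two lines shown above):

```python
# Worked sketch of calculate_shift: shift mu grows linearly with the
# image token count between (256, 0.5) and (4096, 1.15).
def calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b

print(calculate_shift(256))   # 0.5  -> base_shift at the base sequence length
print(calculate_shift(4096))  # 1.15 -> max_shift at the max sequence length
print(calculate_shift(1024))  # 0.63 -> linearly in between
```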
```diff
@@ -176,9 +176,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_3 (`T5TokenizerFast`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        image_encoder (`PreTrainedModel`, *optional*):
+        image_encoder (`SiglipVisionModel`, *optional*):
             Pre-trained Vision Model for IP Adapter.
-        feature_extractor (`BaseImageProcessor`, *optional*):
+        feature_extractor (`SiglipImageProcessor`, *optional*):
             Image processor for IP Adapter.
     """
 
```
```diff
@@ -197,8 +197,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_2: CLIPTokenizer,
         text_encoder_3: T5EncoderModel,
         tokenizer_3: T5TokenizerFast,
-        image_encoder: PreTrainedModel = None,
-        feature_extractor: BaseImageProcessor = None,
+        image_encoder: SiglipVisionModel = None,
+        feature_extractor: SiglipImageProcessor = None,
     ):
         super().__init__()
 
```
```diff
@@ -215,9 +215,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             image_encoder=image_encoder,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
```
```diff
@@ -385,9 +383,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
```
```diff
@@ -870,7 +868,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
-            ip_adapter_image (`PipelineImageInput`, *optional*):
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
             ip_adapter_image_embeds (`torch.Tensor`, *optional*):
                 Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
                 emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
```
```diff
@@ -1014,10 +1013,10 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             )
             mu = calculate_shift(
                 image_seq_len,
-                self.scheduler.config.base_image_seq_len,
-                self.scheduler.config.max_image_seq_len,
-                self.scheduler.config.base_shift,
-                self.scheduler.config.max_shift,
+                self.scheduler.config.get("base_image_seq_len", 256),
+                self.scheduler.config.get("max_image_seq_len", 4096),
+                self.scheduler.config.get("base_shift", 0.5),
+                self.scheduler.config.get("max_shift", 1.16),
             )
             scheduler_kwargs["mu"] = mu
         elif mu is not None:
```
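Note the switch from attribute access to dict-style `.get(...)` with explicit fallbacks: `scheduler.config` is a mapping-like `FrozenDict`, so the pipeline no longer raises when a scheduler config predates the dynamic-shifting keys. A small sketch of the pattern (assumes a default-constructed scheduler):

```python
# Sketch: scheduler.config is dict-like, so .get() degrades gracefully when an
# older serialized config is missing the dynamic-shifting keys.
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()
base_shift = scheduler.config.get("base_shift", 0.5)   # instead of scheduler.config.base_shift
max_shift = scheduler.config.get("max_shift", 1.16)
print(base_shift, max_shift)
```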
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py:

```diff
@@ -20,12 +20,14 @@ import torch
 from transformers import (
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    SiglipImageProcessor,
+    SiglipVisionModel,
     T5EncoderModel,
     T5TokenizerFast,
 )
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import SD3Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
```
```diff
@@ -81,7 +83,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
```
```diff
@@ -163,7 +165,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
```
```diff
@@ -195,10 +197,14 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         tokenizer_3 (`T5TokenizerFast`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        image_encoder (`SiglipVisionModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`SiglipImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """
 
-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
 
     def __init__(
```
```diff
@@ -212,6 +218,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         tokenizer_2: CLIPTokenizer,
         text_encoder_3: T5EncoderModel,
         tokenizer_3: T5TokenizerFast,
+        image_encoder: Optional[SiglipVisionModel] = None,
+        feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
         super().__init__()
 
```
```diff
@@ -225,13 +233,22 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             tokenizer_3=tokenizer_3,
             transformer=transformer,
             scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
+        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
         self.image_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
+            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=latent_channels
+        )
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
         )
-        self.tokenizer_max_length = self.tokenizer.model_max_length
-        self.default_sample_size = self.transformer.config.sample_size
         self.patch_size = (
             self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
         )
```
```diff
@@ -393,9 +410,9 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
```
```diff
@@ -731,6 +748,84 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
     def interrupt(self):
         return self._interrupt
 
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
```
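A toy shape check of the classifier-free-guidance batching performed by `prepare_ip_adapter_image_embeds` above (the tensor sizes are illustrative assumptions, not the adapter's real dimensions): negative embeddings are concatenated in front of the positive ones along the batch axis, matching the `.chunk(2)` split used when precomputed embeds are passed back in.

```python
import torch

single = torch.randn(1, 4, 8)                           # (batch, tokens, emb_dim), illustrative sizes
pos = torch.cat([single] * 2, dim=0)                    # num_images_per_prompt = 2
neg = torch.cat([torch.zeros_like(single)] * 2, dim=0)  # zeros as the negative embedding
embeds = torch.cat([neg, pos], dim=0)                   # CFG layout: [negative | positive]
print(embeds.shape)                                     # torch.Size([4, 4, 8])
negative, positive = embeds.chunk(2)                    # round-trips back to the two halves
```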
```diff
@@ -756,6 +851,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
```
```diff
@@ -777,9 +874,9 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
                 will be used instead
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image. This is set to 1024 by default for the best results.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
```
```diff
@@ -827,6 +924,12 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
```
```diff
@@ -936,10 +1039,10 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             )
             mu = calculate_shift(
                 image_seq_len,
-                self.scheduler.config.base_image_seq_len,
-                self.scheduler.config.max_image_seq_len,
-                self.scheduler.config.base_shift,
-                self.scheduler.config.max_shift,
+                self.scheduler.config.get("base_image_seq_len", 256),
+                self.scheduler.config.get("max_image_seq_len", 4096),
+                self.scheduler.config.get("base_shift", 0.5),
+                self.scheduler.config.get("max_shift", 1.16),
             )
             scheduler_kwargs["mu"] = mu
         elif mu is not None:
```
```diff
@@ -962,7 +1065,22 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             generator,
         )
 
-        # 6. Denoising loop
+        # 6. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
```
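Taken together, the img2img hunks wire SD3 IP-Adapter support end to end: `SD3IPAdapterMixin` contributes `load_ip_adapter`, the Siglip encoder produces the embeds, and the denoising loop routes them through `joint_attention_kwargs`. A hedged usage sketch — the checkpoint, adapter repo, image URLs, and scale below are illustrative placeholders, not values taken from this diff:

```python
import torch
from diffusers import StableDiffusion3Img2ImgPipeline
from diffusers.utils import load_image

pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",  # placeholder checkpoint
    torch_dtype=torch.bfloat16,
).to("cuda")
pipe.load_ip_adapter("InstantX/SD3.5-Large-IP-Adapter")  # placeholder adapter repo
pipe.set_ip_adapter_scale(0.6)

init_image = load_image("https://example.com/init.png")        # placeholder URL
style_image = load_image("https://example.com/reference.png")  # placeholder URL

image = pipe(
    prompt="a cat, in the style of the reference image",
    image=init_image,
    strength=0.6,
    ip_adapter_image=style_image,  # accepted by the img2img pipeline as of 0.33.0
).images[0]
image.save("out.png")
```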