diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +121 -86
- diffusers/loaders/lora_conversion_utils.py +504 -44
- diffusers/loaders/lora_pipeline.py +1769 -181
- diffusers/loaders/peft.py +167 -57
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +646 -72
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +20 -7
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +593 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +9 -1
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +2 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
- diffusers-0.33.0.dist-info/RECORD +608 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
- diffusers-0.32.2.dist-info/RECORD +0 -550
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
CHANGED
@@ -170,10 +170,14 @@ class StableDiffusionXLKDiffusionPipeline(
             scheduler=scheduler,
         )
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

-        self.default_sample_size = self.unet.config.sample_size
+        self.default_sample_size = (
+            self.unet.config.sample_size
+            if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size")
+            else 128
+        )

         model = ModelWrapper(unet, scheduler.alphas_cumprod)
         if scheduler.config.prediction_type == "v_prediction":
@@ -321,7 +325,9 @@ class StableDiffusionXLKDiffusionPipeline(
                 prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
+                    pooled_prompt_embeds = prompt_embeds[0]
+
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
@@ -380,8 +386,10 @@ class StableDiffusionXLKDiffusionPipeline(
                     uncond_input.input_ids.to(device),
                     output_hidden_states=True,
                 )
+
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                 negative_prompt_embeds_list.append(negative_prompt_embeds)
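Note: the two constructor changes above recur across nearly every pipeline in this release — derived attributes now fall back to sane defaults when a component is registered as `None`. A minimal hedged sketch of the behavior (a stand-in class, not diffusers' actual pipeline code):

```python
# Sketch: how the new guards let a pipeline object be built without a VAE/UNet.
class PipelineLike:
    def __init__(self, vae=None, unet=None):
        self.vae = vae
        self.unet = unet
        # Falls back to the Stable Diffusion default of 8 when no VAE is loaded.
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        )
        # Falls back to 128 (SDXL's 1024 px / vae_scale_factor) when no UNet is loaded.
        self.default_sample_size = (
            self.unet.config.sample_size
            if self.unet is not None and hasattr(self.unet.config, "sample_size")
            else 128
        )


pipe = PipelineLike()  # no components at all, yet construction succeeds
print(pipe.vae_scale_factor, pipe.default_sample_size)  # 8 128
```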
diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
CHANGED
@@ -30,6 +30,7 @@ from ...utils import (
     USE_PEFT_BACKEND,
     BaseOutput,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -40,8 +41,16 @@ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```python
@@ -203,8 +212,8 @@ class StableDiffusionLDM3DPipeline(
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
+            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
+            more details about a model's potential harms.
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
@@ -254,7 +263,7 @@ class StableDiffusionLDM3DPipeline(
             feature_extractor=feature_extractor,
             image_encoder=image_encoder,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

@@ -1002,6 +1011,9 @@ class StableDiffusionLDM3DPipeline(
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
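Note: the `is_torch_xla_available()` / `xm.mark_step()` pattern added here (and to several pipelines below) flushes the lazily built XLA graph once per denoising step on TPU. A hedged, self-contained sketch of the same pattern, using a plain try/except where diffusers uses its `is_torch_xla_available` helper:

```python
# Guarded XLA stepping, as adopted by the 0.33.0 pipelines (sketch).
try:
    import torch_xla.core.xla_model as xm  # only importable in torch_xla installs

    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False

for i in range(4):  # stand-in for the denoising loop over timesteps
    ...  # scheduler.step(...) and callbacks would run here
    if XLA_AVAILABLE:
        # Materializes the pending XLA graph so each iteration is compiled and
        # executed step by step instead of accumulating one huge graph.
        xm.mark_step()
```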
diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
CHANGED
@@ -26,6 +26,7 @@ from ...schedulers import DDIMScheduler
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -37,8 +38,16 @@ from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
@@ -179,8 +188,8 @@ class StableDiffusionPanoramaPipeline(
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
+            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
+            more details about a model's potential harms.
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
@@ -230,7 +239,7 @@ class StableDiffusionPanoramaPipeline(
             feature_extractor=feature_extractor,
             image_encoder=image_encoder,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

@@ -1155,6 +1164,9 @@ class StableDiffusionPanoramaPipeline(
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         if output_type != "latent":
             if circular_padding:
                 image = self.decode_latents_with_padding(latents)
diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
CHANGED
@@ -12,13 +12,20 @@ from ...image_processor import PipelineImageInput
 from ...loaders import IPAdapterMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import deprecate, logging
+from ...utils import deprecate, is_torch_xla_available, logging
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from . import StableDiffusionSafePipelineOutput
 from .safety_checker import SafeStableDiffusionSafetyChecker


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -46,8 +53,8 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
+            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
+            more details about a model's potential harms.
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
@@ -74,7 +81,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda
             " abuse, brutality, cruelty"
         )

-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+        if scheduler is not None and getattr(scheduler.config, "steps_offset", 1) != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                 f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
@@ -88,7 +95,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)

-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+        if scheduler is not None and getattr(scheduler.config, "clip_sample", False) is True:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
                 " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -117,17 +124,21 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda
             " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
         )

-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        is_unet_version_less_0_9_0 = (
+            unet is not None
+            and hasattr(unet.config, "_diffusers_version")
+            and version.parse(version.parse(unet.config._diffusers_version).base_version) < version.parse("0.9.0.dev0")
+        )
+        is_unet_sample_size_less_64 = (
+            unet is not None and hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        )
         if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
             deprecation_message = (
                 "The configuration file of the unet has set the default `sample_size` to smaller than"
                 " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
                 " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- stable-diffusion-v1-5/stable-diffusion-v1-5"
+                " \n- stable-diffusion-v1-5/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
                 " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
                 " in the config might lead to incorrect results in future versions. If you have downloaded this"
                 " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
@@ -149,7 +160,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda
             image_encoder=image_encoder,
         )
         self._safety_text_concept = safety_concept
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.register_to_config(requires_safety_checker=requires_safety_checker)

     @property
@@ -739,6 +750,9 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAda
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # 8. Post-processing
         image = self.decode_latents(latents)

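Note: the scheduler and unet deprecation checks in `StableDiffusionPipelineSafe` above switch from `hasattr` lookups on required components to `None`-tolerant `getattr` lookups. A small hedged sketch of why, with stand-in classes rather than the diffusers types:

```python
# Sketch: the 0.33.0-style guard tolerates both a missing scheduler and a
# scheduler config without `steps_offset`; the old hasattr form crashed on None.
class Config:
    pass  # note: no steps_offset attribute


class Scheduler:
    config = Config()


for scheduler in (None, Scheduler()):
    if scheduler is not None and getattr(scheduler.config, "steps_offset", 1) != 1:
        print("would emit a deprecation warning for", scheduler)
    else:
        print("no warning for", scheduler)
```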
diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
CHANGED
@@ -27,6 +27,7 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -38,8 +39,16 @@ from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
@@ -47,7 +56,7 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import StableDiffusionSAGPipeline

         >>> pipe = StableDiffusionSAGPipeline.from_pretrained(
-        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ...     "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
         ... )
         >>> pipe = pipe.to("cuda")

@@ -123,8 +132,8 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
+            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
+            more details about a model's potential harms.
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
@@ -157,7 +166,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
             feature_extractor=feature_extractor,
             image_encoder=image_encoder,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

@@ -840,6 +849,9 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py
CHANGED
@@ -65,7 +65,7 @@ class FlaxStableDiffusionXLPipeline(FlaxDiffusionPipeline):
             unet=unet,
             scheduler=scheduler,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8

     def prepare_inputs(self, prompt: Union[str, List[str]]):
         if not isinstance(prompt, (str, list)):
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
CHANGED
@@ -269,10 +269,14 @@ class StableDiffusionXLPipeline(
             feature_extractor=feature_extractor,
         )
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

-        self.default_sample_size = self.unet.config.sample_size
+        self.default_sample_size = (
+            self.unet.config.sample_size
+            if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size")
+            else 128
+        )

         add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()

@@ -406,7 +410,9 @@ class StableDiffusionXLPipeline(
                 prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
+                    pooled_prompt_embeds = prompt_embeds[0]
+
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
@@ -465,8 +471,10 @@ class StableDiffusionXLPipeline(
                     uncond_input.input_ids.to(device),
                     output_hidden_states=True,
                 )
+
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                 negative_prompt_embeds_list.append(negative_prompt_embeds)
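Note: the pooled-embedding guard above (repeated in the SDXL img2img, inpaint, and instruct-pix2pix variants below) captures the pooled output only from an encoder that actually returns a 2-D pooled tensor, and never clobbers a caller-supplied value. A hedged illustration with dummy tensors standing in for the two SDXL text encoders:

```python
import torch

# First encoder (CLIPTextModel) yields a 3-D last_hidden_state -> skipped;
# second (CLIPTextModelWithProjection) yields 2-D text_embeds -> captured once.
pooled_prompt_embeds = None
encoder_outputs = [
    (torch.randn(2, 77, 768),),   # stand-in for last_hidden_state, ndim == 3
    (torch.randn(2, 1280),),      # stand-in for text_embeds, ndim == 2
]
for prompt_embeds in encoder_outputs:
    if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
        pooled_prompt_embeds = prompt_embeds[0]

print(pooled_prompt_embeds.shape)  # torch.Size([2, 1280])
```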
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
CHANGED
@@ -291,7 +291,7 @@ class StableDiffusionXLImg2ImgPipeline(
         )
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
         self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

         add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
@@ -427,7 +427,9 @@ class StableDiffusionXLImg2ImgPipeline(
                 prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
+                    pooled_prompt_embeds = prompt_embeds[0]
+
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
@@ -486,8 +488,10 @@ class StableDiffusionXLImg2ImgPipeline(
                     uncond_input.input_ids.to(device),
                     output_hidden_states=True,
                 )
+
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                 negative_prompt_embeds_list.append(negative_prompt_embeds)
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
CHANGED
@@ -321,7 +321,7 @@ class StableDiffusionXLInpaintPipeline(
         )
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
         self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
@@ -531,7 +531,9 @@ class StableDiffusionXLInpaintPipeline(
                 prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
+                    pooled_prompt_embeds = prompt_embeds[0]
+
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
@@ -590,8 +592,10 @@ class StableDiffusionXLInpaintPipeline(
                     uncond_input.input_ids.to(device),
                     output_hidden_states=True,
                 )
+
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                 negative_prompt_embeds_list.append(negative_prompt_embeds)
@@ -737,7 +741,7 @@ class StableDiffusionXLInpaintPipeline(
         if padding_mask_crop is not None:
             if not isinstance(image, PIL.Image.Image):
                 raise ValueError(
-                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                    f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
                 )
             if not isinstance(mask_image, PIL.Image.Image):
                 raise ValueError(
@@ -745,7 +749,7 @@ class StableDiffusionXLInpaintPipeline(
                     f" {type(mask_image)}."
                 )
             if output_type != "pil":
-                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")

             if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
                 raise ValueError(
@@ -1505,7 +1509,7 @@ class StableDiffusionXLInpaintPipeline(
                     f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                     f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                     f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
-                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
                     " `pipeline.unet` or your `mask_image` or `image` input."
                 )
             elif num_channels_unet != 4:
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
CHANGED
@@ -199,9 +199,13 @@ class StableDiffusionXLInstructPix2PixPipeline(
             scheduler=scheduler,
         )
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-        self.default_sample_size = self.unet.config.sample_size
+        self.default_sample_size = (
+            self.unet.config.sample_size
+            if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size")
+            else 128
+        )
         self.is_cosxl_edit = is_cosxl_edit

         add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
@@ -333,7 +337,9 @@ class StableDiffusionXLInstructPix2PixPipeline(
                 )

                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
+                    pooled_prompt_embeds = prompt_embeds[0]
+
                 prompt_embeds = prompt_embeds.hidden_states[-2]

                 prompt_embeds_list.append(prompt_embeds)
@@ -385,7 +391,8 @@ class StableDiffusionXLInstructPix2PixPipeline(
                     output_hidden_states=True,
                 )
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                 negative_prompt_embeds_list.append(negative_prompt_embeds)
diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
CHANGED
@@ -24,14 +24,22 @@ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
 from ...image_processor import PipelineImageInput
 from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
 from ...schedulers import EulerDiscreteScheduler
-from ...utils import BaseOutput, logging, replace_example_docstring
+from ...utils import BaseOutput, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
@@ -177,7 +185,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
             scheduler=scheduler,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)

     def _encode_image(
@@ -600,6 +608,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         if not output_type == "latent":
             # cast back to fp16 if needed
             if needs_upcasting:
diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
CHANGED
@@ -22,7 +22,7 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import VaeImageProcessor
-from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -31,6 +31,7 @@ from ...utils import (
     USE_PEFT_BACKEND,
     BaseOutput,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -41,6 +42,14 @@ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
 @dataclass
 class StableDiffusionAdapterPipelineOutput(BaseOutput):
     """
@@ -59,6 +68,7 @@ class StableDiffusionAdapterPipelineOutput(BaseOutput):

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
@@ -178,7 +188,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
+class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
     https://arxiv.org/abs/2302.08453
@@ -208,7 +218,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+            Please, refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
+            details.
         feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
@@ -259,7 +270,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

@@ -914,6 +925,9 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         if output_type == "latent":
             image = latents
             has_nsfw_concept = None
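Note: with `FromSingleFileMixin` now on `StableDiffusionAdapterPipeline`, the pipeline gains `from_single_file` loading. A hedged usage sketch — `from_single_file` and `T2IAdapter` are real diffusers APIs, but the specific checkpoint URL and adapter repo here are illustrative assumptions, not taken from this diff:

```python
import torch
from diffusers import StableDiffusionAdapterPipeline, T2IAdapter

# Illustrative repos; any SD 1.5-compatible single-file checkpoint should work.
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
pipe = StableDiffusionAdapterPipeline.from_single_file(
    "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
    adapter=adapter,
    torch_dtype=torch.float16,
).to("cuda")
```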
|