diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +198 -28
- diffusers/loaders/lora_conversion_utils.py +679 -44
- diffusers/loaders/lora_pipeline.py +1963 -801
- diffusers/loaders/peft.py +169 -84
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +653 -75
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +22 -32
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +409 -49
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +593 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +10 -2
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +14 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
- diffusers-0.33.0.dist-info/RECORD +608 -0
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
- diffusers-0.32.1.dist-info/RECORD +0 -550
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py

@@ -61,6 +61,17 @@ from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 if is_invisible_watermark_available():
     from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker

+
+from ...utils import is_torch_xla_available
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -241,12 +252,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         "feature_extractor",
         "image_encoder",
     ]
-    _callback_tensor_inputs = [
-        "latents",
-        "prompt_embeds",
-        "add_text_embeds",
-        "add_time_ids",
-    ]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "add_text_embeds", "add_time_ids", "control_image"]

     def __init__(
         self,
@@ -281,7 +287,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
             feature_extractor=feature_extractor,
             image_encoder=image_encoder,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
         self.control_image_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
@@ -422,7 +428,9 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                 prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
+                    pooled_prompt_embeds = prompt_embeds[0]
+
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
@@ -481,8 +489,10 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                     uncond_input.input_ids.to(device),
                     output_hidden_states=True,
                 )
+
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                 negative_prompt_embeds_list.append(negative_prompt_embeds)
@@ -731,26 +741,6 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         else:
             assert False

-        # Check `controlnet_conditioning_scale`
-        if (
-            isinstance(self.controlnet, ControlNetModel)
-            or is_compiled
-            and isinstance(self.controlnet._orig_mod, ControlNetModel)
-        ):
-            if not isinstance(controlnet_conditioning_scale, float):
-                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
-
-        elif (
-            isinstance(self.controlnet, ControlNetUnionModel)
-            or is_compiled
-            and isinstance(self.controlnet._orig_mod, ControlNetUnionModel)
-        ):
-            if not isinstance(controlnet_conditioning_scale, float):
-                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
-
-        else:
-            assert False
-
         if not isinstance(control_guidance_start, (tuple, list)):
             control_guidance_start = [control_guidance_start]

@@ -1291,6 +1281,8 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(

         if not isinstance(control_image, list):
             control_image = [control_image]
+        else:
+            control_image = control_image.copy()

         if not isinstance(control_mode, list):
             control_mode = [control_mode]
@@ -1565,6 +1557,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                     add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
                     add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    control_image = callback_outputs.pop("control_image", control_image)

                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -1573,6 +1566,9 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
        # If we do sequential model offloading, let's offload unet and controlnet
        # manually for max memory savings
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diffusers/pipelines/controlnet/pipeline_flax_controlnet.py

@@ -75,7 +75,10 @@ EXAMPLE_DOC_STRING = """
        ...     "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32
        ... )
        >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
-        ...     "
+        ...     "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        ...     controlnet=controlnet,
+        ...     revision="flax",
+        ...     dtype=jnp.float32,
        ... )
        >>> params["controlnet"] = controlnet_params

@@ -132,8 +135,8 @@ class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline):
            [`FlaxDPMSolverMultistepScheduler`].
        safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/
-            about a model's potential harms.
+            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
+            more details about a model's potential harms.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
    """
@@ -175,7 +178,7 @@ class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline):
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8

    def prepare_text_inputs(self, prompt: Union[str, List[str]]):
        if not isinstance(prompt, (str, list)):
diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py

@@ -232,8 +232,8 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
            Tuple[HunyuanDiT2DControlNetModel],
            HunyuanDiT2DMultiControlNetModel,
        ],
-        text_encoder_2=T5EncoderModel,
-        tokenizer_2=MT5Tokenizer,
+        text_encoder_2: Optional[T5EncoderModel] = None,
+        tokenizer_2: Optional[MT5Tokenizer] = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -269,9 +269,7 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
            )

-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.register_to_config(requires_safety_checker=requires_safety_checker)
        self.default_sample_size = (
diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py

@@ -19,12 +19,14 @@ import torch
 from transformers import (
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    SiglipImageProcessor,
+    SiglipVisionModel,
     T5EncoderModel,
     T5TokenizerFast,
 )

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.controlnets.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
 from ...models.transformers import SD3Transformer2DModel
@@ -138,7 +140,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3ControlNetPipeline(
+    DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin
+):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -174,10 +178,14 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             Provides additional conditioning to the `unet` during the denoising process. If you set multiple
             ControlNets as a list, the outputs from each ControlNet are added together to create one combined
             additional conditioning.
+        image_encoder (`SiglipVisionModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`SiglipImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]

     def __init__(
@@ -194,6 +202,8 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
         controlnet: Union[
             SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
         ],
+        image_encoder: Optional[SiglipVisionModel] = None,
+        feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
         super().__init__()
         if isinstance(controlnet, (list, tuple)):
@@ -223,10 +233,10 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             transformer=transformer,
             scheduler=scheduler,
             controlnet=controlnet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
@@ -394,9 +404,9 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
@@ -727,6 +737,84 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
     def interrupt(self):
         return self._interrupt

+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -754,6 +842,8 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -843,6 +933,12 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -1040,7 +1136,22 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
            # SD35 official 8b controlnet does not use encoder_hidden_states
            controlnet_encoder_hidden_states = None

-        # 7. Denoising loop
+        # 7. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 8. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
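With SD3IPAdapterMixin mixed into the ControlNet pipeline above, an IP-Adapter reference image can be passed alongside the control image. A hedged usage sketch follows; the checkpoint ids are placeholders, and load_ip_adapter() is assumed to behave as it does for the plain SD3 pipeline:

# Hypothetical usage sketch only: the model/checkpoint ids are placeholders, and the
# load_ip_adapter() call is assumed from SD3IPAdapterMixin as used by the plain SD3 pipeline.
import torch
from diffusers import SD3ControlNetModel, StableDiffusion3ControlNetPipeline
from diffusers.utils import load_image

controlnet = SD3ControlNetModel.from_pretrained("your-org/sd3-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("your-org/sd3-ip-adapter")  # placeholder repo id

image = pipe(
    prompt="a photo of a cat",
    control_image=load_image("canny_edges.png"),
    ip_adapter_image=load_image("style_reference.png"),  # new argument in this release
    num_inference_steps=28,
).images[0]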
diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py

@@ -19,12 +19,14 @@ import torch
 from transformers import (
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    SiglipImageProcessor,
+    SiglipModel,
     T5EncoderModel,
     T5TokenizerFast,
 )

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.controlnets.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
 from ...models.transformers import SD3Transformer2DModel
@@ -159,7 +161,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3ControlNetInpaintingPipeline(
+    DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin
+):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -192,13 +196,17 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
         controlnet ([`SD3ControlNetModel`] or `List[SD3ControlNetModel]` or [`SD3MultiControlNetModel`]):
-            Provides additional conditioning to the `
+            Provides additional conditioning to the `transformer` during the denoising process. If you set multiple
             ControlNets as a list, the outputs from each ControlNet are added together to create one combined
             additional conditioning.
+        image_encoder (`PreTrainedModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`BaseImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]

     def __init__(
@@ -215,6 +223,8 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
         controlnet: Union[
             SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
         ],
+        image_encoder: SiglipModel = None,
+        feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
         super().__init__()

@@ -229,10 +239,10 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
             transformer=transformer,
             scheduler=scheduler,
             controlnet=controlnet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor, do_resize=True, do_convert_rgb=True, do_normalize=True
         )
@@ -412,9 +422,9 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
@@ -777,6 +787,84 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
     def interrupt(self):
         return self._interrupt

+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -805,6 +893,8 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -898,6 +988,12 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -1059,7 +1155,22 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa
            ]
            controlnet_keep.append(keeps[0] if isinstance(self.controlnet, SD3ControlNetModel) else keeps)

-        # 7. Denoising loop
+        # 7. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 8. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
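As the docstring above notes, precomputed ip_adapter_image_embeds must already contain the negative (unconditional) half when classifier-free guidance is on; the copied helper splits them back apart with chunk(2). A small sketch of that layout, with a stand-in embedding shape:

# Sketch of the layout prepare_ip_adapter_image_embeds() expects for precomputed embeddings
# under classifier-free guidance: [negative, positive] concatenated along dim 0.
import torch

positive = torch.randn(1, 64, 1152)    # stand-in shape for an encoded reference image
negative = torch.zeros_like(positive)  # the helper uses zeros for the unconditional half
ip_adapter_image_embeds = torch.cat([negative, positive], dim=0)

# Inside the pipeline this is undone with chunk(2):
neg, pos = ip_adapter_image_embeds.chunk(2)
assert torch.equal(neg, negative) and torch.equal(pos, positive)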
diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py

@@ -30,6 +30,7 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -41,6 +42,13 @@ from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -178,7 +186,7 @@ class StableDiffusionControlNetXSPipeline(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
         self.control_image_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
@@ -884,6 +892,9 @@ class StableDiffusionControlNetXSPipeline(
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
        # If we do sequential model offloading, let's offload unet and controlnet
        # manually for max memory savings
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
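The same guarded torch_xla import and per-step xm.mark_step() call appears in both pipelines touched here, and in many others in this release. A minimal standalone sketch of the idiom, with a dummy loop standing in for the denoising steps:

# Minimal sketch of the torch_xla guard used across these pipelines; the loop body is a
# stand-in for one denoising step, not pipeline code.
try:
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False

for step in range(28):
    ...  # one denoising step would run here
    if XLA_AVAILABLE:
        xm.mark_step()  # on TPU, flush the lazily built graph once per step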
|