diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +198 -28
- diffusers/loaders/lora_conversion_utils.py +679 -44
- diffusers/loaders/lora_pipeline.py +1963 -801
- diffusers/loaders/peft.py +169 -84
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +653 -75
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +22 -32
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +409 -49
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +593 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +10 -2
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +14 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
- diffusers-0.33.0.dist-info/RECORD +608 -0
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
- diffusers-0.32.1.dist-info/RECORD +0 -550
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/flux/pipeline_flux_controlnet.py

@@ -18,14 +18,16 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from transformers import (
+    CLIPImageProcessor,
     CLIPTextModel,
     CLIPTokenizer,
+    CLIPVisionModelWithProjection,
     T5EncoderModel,
     T5TokenizerFast,
 )
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
+from ...loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
 from ...models.transformers import FluxTransformer2DModel
@@ -61,6 +63,7 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import FluxControlNetPipeline
         >>> from diffusers import FluxControlNetModel
 
+        >>> base_model = "black-forest-labs/FLUX.1-dev"
         >>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny"
         >>> controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
         >>> pipe = FluxControlNetPipeline.from_pretrained(
@@ -89,7 +92,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -171,7 +174,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin, FluxIPAdapterMixin):
     r"""
     The Flux pipeline for text-to-image generation.
 
@@ -198,9 +201,9 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
             [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
     """
 
-    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
-    _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image"]
 
     def __init__(
         self,
@@ -214,6 +217,8 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
         controlnet: Union[
             FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
         ],
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
     ):
         super().__init__()
         if isinstance(controlnet, (list, tuple)):
@@ -228,10 +233,10 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
             transformer=transformer,
             scheduler=scheduler,
             controlnet=controlnet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
@@ -415,14 +420,67 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
 
         return prompt_embeds, pooled_prompt_embeds, text_ids
 
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        return image_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_ip_adapter_image in ip_adapter_image:
+                single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
+                image_embeds.append(single_image_embeds[None, :])
+        else:
+            if not isinstance(ip_adapter_image_embeds, list):
+                ip_adapter_image_embeds = [ip_adapter_image_embeds]
+
+            if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_image_embeds in ip_adapter_image_embeds:
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for single_image_embeds in image_embeds:
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
     def check_inputs(
         self,
         prompt,
         prompt_2,
         height,
         width,
+        negative_prompt=None,
+        negative_prompt_2=None,
         prompt_embeds=None,
+        negative_prompt_embeds=None,
         pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
@@ -457,10 +515,33 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
         elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
             raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
 
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
         if prompt_embeds is not None and pooled_prompt_embeds is None:
             raise ValueError(
                 "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
             )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
 
         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
@@ -599,6 +680,9 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
         self,
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        true_cfg_scale: float = 1.0,
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 28,
@@ -614,6 +698,12 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
         latents: Optional[torch.FloatTensor] = None,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_ip_adapter_image: Optional[PipelineImageInput] = None,
+        negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -681,6 +771,17 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
             pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            negative_ip_adapter_image:
+                (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -729,8 +830,12 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
             prompt_2,
             height,
             width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
             prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
             pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
             max_sequence_length=max_sequence_length,
         )
@@ -754,6 +859,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
         lora_scale = (
            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
+        do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None
         (
             prompt_embeds,
             pooled_prompt_embeds,
@@ -768,6 +874,21 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
             max_sequence_length=max_sequence_length,
             lora_scale=lora_scale,
         )
+        if do_true_cfg:
+            (
+                negative_prompt_embeds,
+                negative_pooled_prompt_embeds,
+                _,
+            ) = self.encode_prompt(
+                prompt=negative_prompt,
+                prompt_2=negative_prompt_2,
+                prompt_embeds=negative_prompt_embeds,
+                pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                lora_scale=lora_scale,
+            )
 
         # 3. Prepare control image
         num_channels_latents = self.transformer.config.in_channels // 4
@@ -876,10 +997,10 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
         image_seq_len = latents.shape[1]
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
@@ -901,12 +1022,43 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
         ]
         controlnet_keep.append(keeps[0] if isinstance(self.controlnet, FluxControlNetModel) else keeps)
 
+        if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
+            negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
+        ):
+            negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+        elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
+            negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
+        ):
+            ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+
+        if self.joint_attention_kwargs is None:
+            self._joint_attention_kwargs = {}
+
+        image_embeds = None
+        negative_image_embeds = None
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+        if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
+            negative_image_embeds = self.prepare_ip_adapter_image_embeds(
+                negative_ip_adapter_image,
+                negative_ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+
         # 7. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
+                if image_embeds is not None:
+                    self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
 
@@ -962,6 +1114,25 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
                     controlnet_blocks_repeat=controlnet_blocks_repeat,
                 )[0]
 
+                if do_true_cfg:
+                    if negative_image_embeds is not None:
+                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
+                    neg_noise_pred = self.transformer(
+                        hidden_states=latents,
+                        timestep=timestep / 1000,
+                        guidance=guidance,
+                        pooled_projections=negative_pooled_prompt_embeds,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        controlnet_block_samples=controlnet_block_samples,
+                        controlnet_single_block_samples=controlnet_single_block_samples,
+                        txt_ids=text_ids,
+                        img_ids=latent_image_ids,
+                        joint_attention_kwargs=self.joint_attention_kwargs,
+                        return_dict=False,
+                        controlnet_blocks_repeat=controlnet_blocks_repeat,
+                    )[0]
+                    noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
+
                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
@@ -979,6 +1150,7 @@ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleF
 
                 latents = callback_outputs.pop("latents", latents)
                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                control_image = callback_outputs.pop("control_image", control_image)
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
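Taken together, these hunks wire IP-Adapter conditioning (via the new `FluxIPAdapterMixin` base class and the optional `image_encoder`/`feature_extractor` components) and true classifier-free guidance (`negative_prompt`, `true_cfg_scale`) into `FluxControlNetPipeline`. A minimal usage sketch of the new arguments, assuming the ControlNet checkpoint from the example docstring and the XLabs-AI Flux IP-Adapter weights; the local image paths are illustrative:

```python
import torch
from diffusers import FluxControlNetModel, FluxControlNetPipeline
from diffusers.utils import load_image

controlnet = FluxControlNetModel.from_pretrained(
    "InstantX/FLUX.1-dev-controlnet-canny", torch_dtype=torch.bfloat16
)
pipe = FluxControlNetPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
).to("cuda")

# load_ip_adapter() is provided by the new FluxIPAdapterMixin base class; it also
# fills in the optional image_encoder/feature_extractor components added above.
pipe.load_ip_adapter(
    "XLabs-AI/flux-ip-adapter",
    weight_name="ip_adapter.safetensors",
    image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14",
)

image = pipe(
    prompt="a cat sitting by a window",
    control_image=load_image("canny_edges.png"),         # illustrative path
    ip_adapter_image=load_image("style_reference.png"),  # routed through encode_image()
    negative_prompt="blurry, low quality",  # only honored when true_cfg_scale > 1
    true_cfg_scale=3.5,                     # enables the do_true_cfg branch
    num_inference_steps=28,
).images[0]
image.save("output.png")
```

Note that true CFG runs the transformer a second time per step with the negative embeddings (the `neg_noise_pred` branch in the diff), so it roughly doubles the denoising cost relative to the distilled-guidance default of `true_cfg_scale=1.0`.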
diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py

@@ -87,7 +87,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -198,7 +198,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
 
     model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
     _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image"]
 
     def __init__(
         self,
@@ -227,9 +227,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
             scheduler=scheduler,
             controlnet=controlnet,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
@@ -535,7 +533,6 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
 
         return latents
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline.prepare_latents
     def prepare_latents(
         self,
         image,
@@ -864,10 +861,10 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
         image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
@@ -975,6 +972,7 @@ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
 
                 latents = callback_outputs.pop("latents", latents)
                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                control_image = callback_outputs.pop("control_image", control_image)
 
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
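A recurring change across all three files swaps attribute access on `scheduler.config` for `config.get(key, default)`, so a scheduler whose config lacks the dynamic-shifting fields now falls back to the Flux defaults instead of raising `AttributeError`. A self-contained sketch of the linear interpolation `calculate_shift` performs, using those fallback defaults; the `m`/`b` lines restate the hunk context, and the final `return` simply evaluates the fitted line (the example sequence lengths are illustrative):

```python
def calculate_shift(
    image_seq_len: int,
    base_seq_len: int = 256,   # fallback for scheduler.config.get("base_image_seq_len", 256)
    max_seq_len: int = 4096,   # fallback for scheduler.config.get("max_image_seq_len", 4096)
    base_shift: float = 0.5,   # fallback for scheduler.config.get("base_shift", 0.5)
    max_shift: float = 1.15,   # fallback for scheduler.config.get("max_shift", 1.15)
) -> float:
    # Fit a line through (base_seq_len, base_shift) and (max_seq_len, max_shift)...
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    # ...and evaluate it at the actual packed sequence length to get mu.
    return image_seq_len * m + b

# A 1024x1024 Flux image packs to (1024 / 16) ** 2 = 4096 latent tokens:
print(calculate_shift(4096))  # 1.15
print(calculate_shift(256))   # 0.5
```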
diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py

@@ -89,7 +89,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -200,7 +200,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
 
     model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
     _optional_components = []
-    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image", "mask", "masked_image_latents"]
 
     def __init__(
         self,
@@ -230,15 +230,14 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
             controlnet=controlnet,
         )
 
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
-            vae_latent_channels=
+            vae_latent_channels=latent_channels,
             do_normalize=False,
             do_binarize=True,
             do_convert_grayscale=True,
@@ -508,7 +507,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
         if padding_mask_crop is not None:
             if not isinstance(image, PIL.Image.Image):
                 raise ValueError(
-                    f"The image should be a PIL image when inpainting mask crop, but is of type
+                    f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
                 )
             if not isinstance(mask_image, PIL.Image.Image):
                 raise ValueError(
@@ -516,7 +515,7 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
                     f" {type(mask_image)}."
                 )
             if output_type != "pil":
-                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
 
         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
@@ -562,7 +561,6 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
 
         return latents
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline.prepare_latents
     def prepare_latents(
         self,
         image,
@@ -615,7 +613,6 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
         return latents, noise, image_latents, latent_image_ids
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline.prepare_mask_latents
     def prepare_mask_latents(
         self,
         mask,
@@ -931,8 +928,8 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
         if isinstance(self.controlnet, FluxControlNetModel):
             control_image = self.prepare_image(
                 image=control_image,
-                width=
-                height=
+                width=width,
+                height=height,
                 batch_size=batch_size * num_images_per_prompt,
                 num_images_per_prompt=num_images_per_prompt,
                 device=device,
@@ -1017,10 +1014,10 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
         )
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
@@ -1179,6 +1176,9 @@ class FluxControlNetInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, From
 
                 latents = callback_outputs.pop("latents", latents)
                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                control_image = callback_outputs.pop("control_image", control_image)
+                mask = callback_outputs.pop("mask", mask)
+                masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
 
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
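The expanded `_callback_tensor_inputs` lists mean step-end callbacks can now observe (and replace) `control_image` on all three pipelines, plus `mask` and `masked_image_latents` on the inpaint pipeline, since the denoising loops now pop those keys back out of `callback_outputs`. A sketch using the standard `callback_on_step_end` contract, assuming `pipe` is a loaded `FluxControlNetInpaintPipeline` and `init_image`, `mask_image`, and `canny_image` are PIL images prepared elsewhere:

```python
def inspect_inpaint_tensors(pipe, step, timestep, callback_kwargs):
    # These keys are only legal here because the diff added them to
    # _callback_tensor_inputs; whatever the callback returns is popped
    # back into the denoising loop for the next step.
    mask = callback_kwargs["mask"]
    control_image = callback_kwargs["control_image"]
    print(f"step {step}: mask {tuple(mask.shape)}, control {tuple(control_image.shape)}")
    return callback_kwargs  # return tensors unchanged (or modified in flight)

result = pipe(
    prompt="restore the damaged area",
    image=init_image,
    mask_image=mask_image,
    control_image=canny_image,
    callback_on_step_end=inspect_inpaint_tensors,
    callback_on_step_end_tensor_inputs=["latents", "mask", "control_image"],
)
```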