diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +121 -86
- diffusers/loaders/lora_conversion_utils.py +504 -44
- diffusers/loaders/lora_pipeline.py +1769 -181
- diffusers/loaders/peft.py +167 -57
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +646 -72
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +20 -7
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +593 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +9 -1
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +2 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
- diffusers-0.33.0.dist-info/RECORD +608 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
- diffusers-0.32.2.dist-info/RECORD +0 -550
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
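Beyond the Flux pipeline changes excerpted below, the file list shows several entirely new modules: a hooks subsystem (group offloading, layer-wise casting, FasterCache, pyramid attention broadcast), `diffusers/models/auto_model.py`, a remote-utils module, and new pipeline families (CogView4, ConsisID, EasyAnimate, Lumina2, OmniGen, Sana Sprint, Wan). As a hedged sketch of the new `AutoModel` entry point added in `auto_model.py` (the model ID is illustrative; check the 0.33.0 docs for the exact signature):

```python
import torch
from diffusers import AutoModel  # new in 0.33.0 (diffusers/models/auto_model.py)

# AutoModel resolves the concrete model class from the subfolder's config, so
# the same call can load a transformer, UNet, or VAE without naming its class.
transformer = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # illustrative model ID
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
```

The excerpt that follows covers two of the listed files: `diffusers/pipelines/flux/pipeline_flux_fill.py` and `diffusers/pipelines/flux/pipeline_flux_img2img.py`.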
diffusers/pipelines/flux/pipeline_flux_fill.py

@@ -82,7 +82,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
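This hunk only lowers the `max_shift` default, but it is worth spelling out what `calculate_shift` computes: `mu` is linearly interpolated in the image token count between `base_shift` and `max_shift`. A minimal standalone sketch of the function shown above:

```python
# Standalone sketch of the shift schedule above. For a 1024x1024 image, Flux
# packs latents into (1024 / 16) ** 2 = 4096 tokens, which is exactly
# max_seq_len, so mu lands on the max_shift endpoint.
def calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096,
                    base_shift=0.5, max_shift=1.15):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b

print(calculate_shift(256))   # 0.5  -> small images keep the base shift
print(calculate_shift(4096))  # 1.15 -> 1024x1024 images get the new max_shift
```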
@@ -221,15 +221,16 @@ class FluxFillPipeline(
             transformer=transformer,
             scheduler=scheduler,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor * 2, vae_latent_channels=self.latent_channels
+        )
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
-            vae_latent_channels=self.vae.config.latent_channels,
+            vae_latent_channels=self.latent_channels,
             do_normalize=False,
             do_binarize=True,
             do_convert_grayscale=True,
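The `getattr(self, "vae", None)` form collapses the old three-line `hasattr`/`is not None` check, and `latent_channels` is now computed once and shared by both processors. For concreteness, with the values from the standard Flux VAE checkpoint (assumed here, not read from this diff) the expressions resolve as follows:

```python
# Assumed config values for the standard Flux VAE (not part of this diff).
block_out_channels = [128, 256, 512, 512]
latent_channels = 16

vae_scale_factor = 2 ** (len(block_out_channels) - 1)  # 2**3 = 8x spatial compression
patch_size = 2  # Flux packs latents into 2x2 patches before the transformer
print(vae_scale_factor * patch_size)  # 16 -> input height/width must divide by 16
```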
@@ -494,10 +495,38 @@ class FluxFillPipeline(

         return prompt_embeds, pooled_prompt_embeds, text_ids

+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
     def check_inputs(
         self,
         prompt,
         prompt_2,
+        strength,
         height,
         width,
         prompt_embeds=None,
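`get_timesteps` is what makes `strength` work: it drops the head of the schedule in proportion to `strength` and tells the scheduler where denoising begins. A worked example, assuming a first-order scheduler (`scheduler.order == 1`):

```python
# Worked example of the get_timesteps arithmetic above.
num_inference_steps = 50
strength = 0.6

init_timestep = min(num_inference_steps * strength, num_inference_steps)  # 30.0
t_start = int(max(num_inference_steps - init_timestep, 0))                # 20

# The pipeline keeps timesteps[20:]: the reference image is noised to the
# strength-0.6 level and only the last 30 of 50 steps are actually run.
print(num_inference_steps - t_start)  # 30
```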
@@ -508,6 +537,9 @@ class FluxFillPipeline(
         mask_image=None,
         masked_image_latents=None,
     ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
         if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
             logger.warning(
                 f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
@@ -625,9 +657,11 @@ class FluxFillPipeline(
         """
         self.vae.disable_tiling()

-    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
+    # Copied from diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline.prepare_latents
     def prepare_latents(
         self,
+        image,
+        timestep,
         batch_size,
         num_channels_latents,
         height,
@@ -637,28 +671,41 @@ class FluxFillPipeline(
         generator,
         latents=None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         # VAE applies 8x compression on images but we must also account for packing which requires
         # latent height and width to be divisible by 2.
         height = 2 * (int(height) // (self.vae_scale_factor * 2))
         width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
         shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

         if latents is not None:
-            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
             return latents.to(device=device, dtype=dtype), latent_image_ids

-        if isinstance(generator, list) and len(generator) != batch_size:
+        image = image.to(device=device, dtype=dtype)
+        if image.shape[1] != self.latent_channels:
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+        else:
+            image_latents = image
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
             raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
             )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)

-        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
-
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
-
         return latents, latent_image_ids

     @property
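The key behavioral change: instead of sampling pure noise, `prepare_latents` now encodes the reference image and partially noises it with `scheduler.scale_noise`. For `FlowMatchEulerDiscreteScheduler` this amounts to a linear interpolation; a minimal sketch of the idea (not the scheduler's exact implementation):

```python
import torch

def scale_noise(sample: torch.Tensor, sigma: float, noise: torch.Tensor) -> torch.Tensor:
    # Flow-matching forward process: blend clean latents with Gaussian noise.
    # sigma is derived from the strength-adjusted starting timestep.
    return sigma * noise + (1.0 - sigma) * sample

image_latents = torch.randn(1, 16, 128, 128)  # stand-in for encoded reference image
noise = torch.randn_like(image_latents)
latents = scale_noise(image_latents, sigma=0.6, noise=noise)  # strength-dependent start
```

With `strength=1.0` the starting timestep has `sigma = 1.0`, so the image contribution vanishes and the pipeline behaves as before.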
@@ -688,6 +735,7 @@ class FluxFillPipeline(
         masked_image_latents: Optional[torch.FloatTensor] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
+        strength: float = 1.0,
         num_inference_steps: int = 50,
         sigmas: Optional[List[float]] = None,
         guidance_scale: float = 30.0,
@@ -732,6 +780,12 @@ class FluxFillPipeline(
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -739,7 +793,7 @@ class FluxFillPipeline(
                 Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
-            guidance_scale (`float`, *optional*, defaults to 7.0):
+            guidance_scale (`float`, *optional*, defaults to 30.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -795,6 +849,7 @@ class FluxFillPipeline(
         self.check_inputs(
             prompt,
             prompt_2,
+            strength,
             height,
             width,
             prompt_embeds=prompt_embeds,
@@ -810,6 +865,9 @@ class FluxFillPipeline(
         self._joint_attention_kwargs = joint_attention_kwargs
         self._interrupt = False

+        init_image = self.image_processor.preprocess(image, height=height, width=width)
+        init_image = init_image.to(dtype=torch.float32)
+
         # 2. Define call parameters
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
@@ -839,9 +897,37 @@ class FluxFillPipeline(
             lora_scale=lora_scale,
         )

-        # 4. Prepare latent variables
+        # 4. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 5. Prepare latent variables
         num_channels_latents = self.vae.config.latent_channels
         latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
             batch_size * num_images_per_prompt,
             num_channels_latents,
             height,
@@ -852,17 +938,16 @@ class FluxFillPipeline(
             latents,
         )

-        # 5. Prepare mask and masked image latents
+        # 6. Prepare mask and masked image latents
         if masked_image_latents is not None:
             masked_image_latents = masked_image_latents.to(latents.device)
         else:
-            image = self.image_processor.preprocess(image, height=height, width=width)
             mask_image = self.mask_processor.preprocess(mask_image, height=height, width=width)

-            masked_image = image * (1 - mask_image)
+            masked_image = init_image * (1 - mask_image)
             masked_image = masked_image.to(device=device, dtype=prompt_embeds.dtype)

-            height, width = image.shape[-2:]
+            height, width = init_image.shape[-2:]
             mask, masked_image_latents = self.prepare_mask_latents(
                 mask_image,
                 masked_image,
@@ -877,23 +962,6 @@ class FluxFillPipeline(
             )
             masked_image_latents = torch.cat((masked_image_latents, mask), dim=-1)

-        # 6. Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
-        image_seq_len = latents.shape[1]
-        mu = calculate_shift(
-            image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
-        )
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            mu=mu,
-        )
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)

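Taken together, the `pipeline_flux_fill.py` hunks above add img2img-style partial denoising to the fill pipeline. A hedged usage sketch (file paths are placeholders; `strength < 1.0` requires this 0.33.0 code):

```python
import torch
from diffusers import FluxFillPipeline
from diffusers.utils import load_image

pipe = FluxFillPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
).to("cuda")

image = load_image("input.png")  # reference image (placeholder path)
mask = load_image("mask.png")    # white pixels mark the region to repaint

result = pipe(
    prompt="a red brick wall",
    image=image,
    mask_image=mask,
    strength=0.85,  # new in 0.33.0: values < 1.0 preserve more of the original
    guidance_scale=30.0,
    num_inference_steps=50,
).images[0]
result.save("filled.png")
```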
diffusers/pipelines/flux/pipeline_flux_img2img.py

@@ -17,10 +17,17 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
 import torch
-from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+    T5EncoderModel,
+    T5TokenizerFast,
+)

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
+from ...loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -77,7 +84,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -159,7 +166,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin, FluxIPAdapterMixin):
     r"""
     The Flux pipeline for image inpainting.

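Mixing in `FluxIPAdapterMixin` gives the img2img pipeline a `load_ip_adapter` method, which is what populates the optional `image_encoder`/`feature_extractor` components added below. A hedged sketch of how it would be wired up (the repository, weight file, and parameter names are assumptions based on the Flux IP-Adapter release, not taken from this diff):

```python
import torch
from diffusers import FluxImg2ImgPipeline
from diffusers.utils import load_image

pipe = FluxImg2ImgPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Assumed repo/file names; load_ip_adapter comes from FluxIPAdapterMixin.
pipe.load_ip_adapter(
    "XLabs-AI/flux-ip-adapter",
    weight_name="ip_adapter.safetensors",
    image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14",
)

out = pipe(
    prompt="a watercolor landscape",
    image=load_image("init.png"),                  # placeholder path
    ip_adapter_image=load_image("style_ref.png"),  # placeholder path
    strength=0.7,
).images[0]
```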
@@ -186,8 +193,8 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
     """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds"]

     def __init__(
@@ -199,6 +206,8 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         text_encoder_2: T5EncoderModel,
         tokenizer_2: T5TokenizerFast,
         transformer: FluxTransformer2DModel,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
     ):
         super().__init__()

@@ -210,13 +219,16 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             tokenizer_2=tokenizer_2,
             transformer=transformer,
             scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor * 2, vae_latent_channels=self.latent_channels
+        )
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
         )
@@ -397,6 +409,55 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile

         return prompt_embeds, pooled_prompt_embeds, text_ids

+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        return image_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        image_embeds = []
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_ip_adapter_image in ip_adapter_image:
+                single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
+                image_embeds.append(single_image_embeds[None, :])
+        else:
+            if not isinstance(ip_adapter_image_embeds, list):
+                ip_adapter_image_embeds = [ip_adapter_image_embeds]
+
+            if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
+                )
+
+            for single_image_embeds in ip_adapter_image_embeds:
+                image_embeds.append(single_image_embeds)
+
+        ip_adapter_image_embeds = []
+        for single_image_embeds in image_embeds:
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
+
     # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
     def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
         if isinstance(generator, list):
@@ -431,8 +492,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         strength,
         height,
         width,
+        negative_prompt=None,
+        negative_prompt_2=None,
         prompt_embeds=None,
+        negative_prompt_embeds=None,
         pooled_prompt_embeds=None,
+        negative_pooled_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
@@ -470,10 +535,33 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
             raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
         if prompt_embeds is not None and pooled_prompt_embeds is None:
             raise ValueError(
                 "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
             )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )

         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
@@ -549,7 +637,10 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             return latents.to(device=device, dtype=dtype), latent_image_ids

         image = image.to(device=device, dtype=dtype)
-        image_latents = self._encode_vae_image(image=image, generator=generator)
+        if image.shape[1] != self.latent_channels:
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+        else:
+            image_latents = image
         if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
             # expand init_latents for batch_size
             additional_image_per_prompt = batch_size // image_latents.shape[0]
@@ -588,6 +679,9 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         self,
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        true_cfg_scale: float = 1.0,
         image: PipelineImageInput = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
@@ -600,6 +694,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         latents: Optional[torch.FloatTensor] = None,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_ip_adapter_image: Optional[PipelineImageInput] = None,
+        negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -661,6 +761,17 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            negative_ip_adapter_image:
+                (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -699,8 +810,12 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             strength,
             height,
             width,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
             prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
             pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
             max_sequence_length=max_sequence_length,
         )
@@ -726,6 +841,7 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         lora_scale = (
             self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
+        do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None
         (
             prompt_embeds,
             pooled_prompt_embeds,
@@ -740,16 +856,31 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
             max_sequence_length=max_sequence_length,
             lora_scale=lora_scale,
         )
+        if do_true_cfg:
+            (
+                negative_prompt_embeds,
+                negative_pooled_prompt_embeds,
+                _,
+            ) = self.encode_prompt(
+                prompt=negative_prompt,
+                prompt_2=negative_prompt_2,
+                prompt_embeds=negative_prompt_embeds,
+                pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                lora_scale=lora_scale,
+            )

         # 4.Prepare timesteps
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
         image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
@@ -793,12 +924,43 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         else:
             guidance = None

+        if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
+            negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
+        ):
+            negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+        elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
+            negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
+        ):
+            ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
+
+        if self.joint_attention_kwargs is None:
+            self._joint_attention_kwargs = {}
+
+        image_embeds = None
+        negative_image_embeds = None
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+        if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
+            negative_image_embeds = self.prepare_ip_adapter_image_embeds(
+                negative_ip_adapter_image,
+                negative_ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+
         # 6. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue

+                if image_embeds is not None:
+                    self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)
                 noise_pred = self.transformer(
@@ -813,6 +975,22 @@ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
                     return_dict=False,
                 )[0]

+                if do_true_cfg:
+                    if negative_image_embeds is not None:
+                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
+                    neg_noise_pred = self.transformer(
+                        hidden_states=latents,
+                        timestep=timestep / 1000,
+                        guidance=guidance,
+                        pooled_projections=negative_pooled_prompt_embeds,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        txt_ids=text_ids,
+                        img_ids=latent_image_ids,
+                        joint_attention_kwargs=self.joint_attention_kwargs,
+                        return_dict=False,
+                    )[0]
+                    noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
+
                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
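The `do_true_cfg` branch implements classic classifier-free guidance on top of Flux's distilled `guidance` embedding: one extra transformer pass per step with the negative embeddings, then the standard CFG extrapolation. A minimal sketch of the combination step:

```python
import torch

def true_cfg(noise_pred: torch.Tensor, neg_noise_pred: torch.Tensor, scale: float) -> torch.Tensor:
    # Classifier-free guidance: extrapolate from the negative (unconditional)
    # prediction toward the conditional one; scale == 1.0 returns noise_pred.
    return neg_noise_pred + scale * (noise_pred - neg_noise_pred)

pos = torch.randn(1, 4096, 64)  # stand-in for the conditional prediction
neg = torch.randn_like(pos)     # stand-in for the negative prediction
guided = true_cfg(pos, neg, scale=3.5)  # only runs when true_cfg_scale > 1
```

Note the cost: with `true_cfg_scale > 1` each denoising step runs the transformer twice, which is why the flag defaults to 1.0 (off).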