diffusers 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in that registry.
- diffusers/__init__.py +48 -1
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/diffusers_cli.py +1 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +1 -1
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/hooks/faster_cache.py +2 -2
- diffusers/hooks/group_offloading.py +128 -29
- diffusers/hooks/hooks.py +2 -2
- diffusers/hooks/layerwise_casting.py +3 -3
- diffusers/hooks/pyramid_attention_broadcast.py +1 -1
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +4 -0
- diffusers/loaders/ip_adapter.py +5 -14
- diffusers/loaders/lora_base.py +212 -111
- diffusers/loaders/lora_conversion_utils.py +275 -34
- diffusers/loaders/lora_pipeline.py +1554 -819
- diffusers/loaders/peft.py +52 -109
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +20 -4
- diffusers/loaders/single_file_utils.py +225 -5
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +1 -1
- diffusers/loaders/transformer_sd3.py +2 -2
- diffusers/loaders/unet.py +2 -16
- diffusers/loaders/unet_loader_utils.py +1 -1
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +15 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +4 -4
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +14 -10
- diffusers/models/auto_model.py +47 -10
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +1 -1
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +16 -15
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +1 -1
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +10 -12
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/modeling_utils.py +44 -14
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +742 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +317 -25
- diffusers/models/transformers/transformer_cosmos.py +579 -0
- diffusers/models/transformers/transformer_flux.py +9 -11
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +2 -2
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +24 -8
- diffusers/models/transformers/transformer_wan_vace.py +393 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +2 -2
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/pipelines/__init__.py +37 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +6 -7
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +1 -1
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +10 -17
- diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
- diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +3 -4
- diffusers/pipelines/pipeline_loading_utils.py +89 -13
- diffusers/pipelines/pipeline_utils.py +105 -33
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +13 -10
- diffusers/pipelines/wan/pipeline_wan_i2v.py +38 -18
- diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +179 -1
- diffusers/quantizers/base.py +6 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +16 -13
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +8 -8
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -1
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
- diffusers/schedulers/scheduling_utils.py +1 -1
- diffusers/schedulers/scheduling_utils_flax.py +1 -1
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +13 -5
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +120 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
- diffusers/utils/dynamic_modules_utils.py +21 -3
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/import_utils.py +81 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +91 -8
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +59 -7
- diffusers/utils/torch_utils.py +25 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/METADATA +70 -55
- diffusers-0.34.0.dist-info/RECORD +639 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/WHEEL +1 -1
- diffusers-0.33.1.dist-info/RECORD +0 -608
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/diffusers/pipelines/visualcloze/visualcloze_utils.py
@@ -0,0 +1,251 @@
+# Copyright 2025 VisualCloze team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from PIL import Image
+
+from ...image_processor import VaeImageProcessor
+
+
+class VisualClozeProcessor(VaeImageProcessor):
+    """
+    Image processor for the VisualCloze pipeline.
+
+    This processor handles the preprocessing of images for visual cloze tasks, including resizing, normalization, and
+    mask generation.
+
+    Args:
+        resolution (int, optional):
+            Target resolution for processing images. Each image will be resized to this resolution before being
+            concatenated to avoid out-of-memory errors. Defaults to 384.
+        *args: Additional arguments passed to [~image_processor.VaeImageProcessor]
+        **kwargs: Additional keyword arguments passed to [~image_processor.VaeImageProcessor]
+    """
+
+    def __init__(self, *args, resolution: int = 384, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.resolution = resolution
+
+    def preprocess_image(
+        self, input_images: List[List[Optional[Image.Image]]], vae_scale_factor: int
+    ) -> Tuple[List[List[torch.Tensor]], List[List[List[int]]], List[int]]:
+        """
+        Preprocesses input images for the VisualCloze pipeline.
+
+        This function handles the preprocessing of input images by:
+        1. Resizing and cropping images to maintain consistent dimensions
+        2. Converting images to the Tensor format for the VAE
+        3. Normalizing pixel values
+        4. Tracking image sizes and positions of target images
+
+        Args:
+            input_images (List[List[Optional[Image.Image]]]):
+                A nested list of PIL Images where:
+                - Outer list represents different samples, including in-context examples and the query
+                - Inner list contains images for the task
+                - In the last row, condition images are provided and the target images are placed as None
+            vae_scale_factor (int):
+                The scale factor used by the VAE for resizing images
+
+        Returns:
+            Tuple containing:
+            - List[List[torch.Tensor]]: Preprocessed images in tensor format
+            - List[List[List[int]]]: Dimensions of each processed image [height, width]
+            - List[int]: Target positions indicating which images are to be generated
+        """
+        n_samples, n_task_images = len(input_images), len(input_images[0])
+        divisible = 2 * vae_scale_factor
+
+        processed_images: List[List[Image.Image]] = [[] for _ in range(n_samples)]
+        resize_size: List[Optional[Tuple[int, int]]] = [None for _ in range(n_samples)]
+        target_position: List[int] = []
+
+        # Process each sample
+        for i in range(n_samples):
+            # Determine size from first non-None image
+            for j in range(n_task_images):
+                if input_images[i][j] is not None:
+                    aspect_ratio = input_images[i][j].width / input_images[i][j].height
+                    target_area = self.resolution * self.resolution
+                    new_h = int((target_area / aspect_ratio) ** 0.5)
+                    new_w = int(new_h * aspect_ratio)
+
+                    new_w = max(new_w // divisible, 1) * divisible
+                    new_h = max(new_h // divisible, 1) * divisible
+                    resize_size[i] = (new_w, new_h)
+                    break
+
+            # Process all images in the sample
+            for j in range(n_task_images):
+                if input_images[i][j] is not None:
+                    target = self._resize_and_crop(input_images[i][j], resize_size[i][0], resize_size[i][1])
+                    processed_images[i].append(target)
+                    if i == n_samples - 1:
+                        target_position.append(0)
+                else:
+                    blank = Image.new("RGB", resize_size[i] or (self.resolution, self.resolution), (0, 0, 0))
+                    processed_images[i].append(blank)
+                    if i == n_samples - 1:
+                        target_position.append(1)
+
+        # Ensure consistent width when there are multiple target images
+        if len(target_position) > 1 and sum(target_position) > 1:
+            new_w = resize_size[n_samples - 1][0] or 384
+            for i in range(len(processed_images)):
+                for j in range(len(processed_images[i])):
+                    if processed_images[i][j] is not None:
+                        new_h = int(processed_images[i][j].height * (new_w / processed_images[i][j].width))
+                        new_w = int(new_w / 16) * 16
+                        new_h = int(new_h / 16) * 16
+                        processed_images[i][j] = self.resize(processed_images[i][j], new_h, new_w)
+
+        # Convert to tensors and normalize
+        image_sizes = []
+        for i in range(len(processed_images)):
+            image_sizes.append([[img.height, img.width] for img in processed_images[i]])
+            for j, image in enumerate(processed_images[i]):
+                image = self.pil_to_numpy(image)
+                image = self.numpy_to_pt(image)
+                image = self.normalize(image)
+                processed_images[i][j] = image
+
+        return processed_images, image_sizes, target_position
+
+    def preprocess_mask(
+        self, input_images: List[List[Image.Image]], target_position: List[int]
+    ) -> List[List[torch.Tensor]]:
+        """
+        Generate masks for the VisualCloze pipeline.
+
+        Args:
+            input_images (List[List[Image.Image]]):
+                Processed images from preprocess_image
+            target_position (List[int]):
+                Binary list marking the positions of target images (1 for target, 0 for condition)
+
+        Returns:
+            List[List[torch.Tensor]]:
+                A nested list of mask tensors (1 for target positions, 0 for condition images)
+        """
+        mask = []
+        for i, row in enumerate(input_images):
+            if i == len(input_images) - 1:  # Query row
+                row_masks = [
+                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=m) for m in target_position
+                ]
+            else:  # In-context examples
+                row_masks = [
+                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=0) for _ in target_position
+                ]
+            mask.append(row_masks)
+        return mask
+
+    def preprocess_image_upsampling(
+        self,
+        input_images: List[List[Image.Image]],
+        height: int,
+        width: int,
+    ) -> Tuple[List[List[Image.Image]], List[List[List[int]]]]:
+        """Process images for the upsampling stage in the VisualCloze pipeline.
+
+        Args:
+            input_images: Input image to process
+            height: Target height
+            width: Target width
+
+        Returns:
+            Tuple of processed image and its size
+        """
+        image = self.resize(input_images[0][0], height, width)
+        image = self.pil_to_numpy(image)  # to np
+        image = self.numpy_to_pt(image)  # to pt
+        image = self.normalize(image)
+
+        input_images[0][0] = image
+        image_sizes = [[[height, width]]]
+        return input_images, image_sizes
+
+    def preprocess_mask_upsampling(self, input_images: List[List[Image.Image]]) -> List[List[torch.Tensor]]:
+        return [[torch.ones((1, 1, input_images[0][0].shape[2], input_images[0][0].shape[3]))]]
+
+    def get_layout_prompt(self, size: Tuple[int, int]) -> str:
+        layout_instruction = (
+            f"A grid layout with {size[0]} rows and {size[1]} columns, displaying {size[0] * size[1]} images arranged side by side."
+        )
+        return layout_instruction
+
+    def preprocess(
+        self,
+        task_prompt: Union[str, List[str]],
+        content_prompt: Union[str, List[str]],
+        input_images: Optional[List[List[List[Optional[str]]]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        upsampling: bool = False,
+        vae_scale_factor: int = 16,
+    ) -> Dict:
+        """Process visual cloze inputs.
+
+        Args:
+            task_prompt: Task description(s)
+            content_prompt: Content description(s)
+            input_images: List of images or None for the target images
+            height: Optional target height for upsampling stage
+            width: Optional target width for upsampling stage
+            upsampling: Whether this is in the upsampling processing stage
+
+        Returns:
+            Dictionary containing processed images, masks, prompts and metadata
+        """
+        if isinstance(task_prompt, str):
+            task_prompt = [task_prompt]
+            content_prompt = [content_prompt]
+            input_images = [input_images]
+
+        output = {
+            "init_image": [],
+            "mask": [],
+            "task_prompt": task_prompt if not upsampling else [None for _ in range(len(task_prompt))],
+            "content_prompt": content_prompt,
+            "layout_prompt": [],
+            "target_position": [],
+            "image_size": [],
+        }
+        for i in range(len(task_prompt)):
+            if upsampling:
+                layout_prompt = None
+            else:
+                layout_prompt = self.get_layout_prompt((len(input_images[i]), len(input_images[i][0])))
+
+            if upsampling:
+                cur_processed_images, cur_image_size = self.preprocess_image_upsampling(
+                    input_images[i], height=height, width=width
+                )
+                cur_mask = self.preprocess_mask_upsampling(cur_processed_images)
+            else:
+                cur_processed_images, cur_image_size, cur_target_position = self.preprocess_image(
+                    input_images[i], vae_scale_factor=vae_scale_factor
+                )
+                cur_mask = self.preprocess_mask(cur_processed_images, cur_target_position)
+
+                output["target_position"].append(cur_target_position)
+
+            output["image_size"].append(cur_image_size)
+            output["init_image"].append(cur_processed_images)
+            output["mask"].append(cur_mask)
+            output["layout_prompt"].append(layout_prompt)
+
+        return output
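For orientation, here is a minimal usage sketch of the new processor (not part of the diff; the prompts, image sizes, and the 2x3 grid are illustrative placeholders):

```python
from PIL import Image

from diffusers.pipelines.visualcloze.visualcloze_utils import VisualClozeProcessor

processor = VisualClozeProcessor(resolution=384)

# One in-context example row plus a query row; the None slot in the last
# row marks the target image to be generated.
example_row = [Image.new("RGB", (512, 512)) for _ in range(3)]
query_row = [Image.new("RGB", (512, 512)), Image.new("RGB", (512, 512)), None]

out = processor.preprocess(
    task_prompt="Colorize the sketch following the example.",  # placeholder
    content_prompt="A watercolor landscape.",  # placeholder
    input_images=[example_row, query_row],
    vae_scale_factor=16,
)
print(out["target_position"])   # [[0, 0, 1]] -- only the None slot is a target
print(out["layout_prompt"][0])  # "A grid layout with 2 rows and 3 columns, ..."
```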
--- a/diffusers/pipelines/wan/__init__.py
+++ b/diffusers/pipelines/wan/__init__.py
@@ -24,6 +24,7 @@ except OptionalDependencyNotAvailable:
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
+    _import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
     _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     else:
         from .pipeline_wan import WanPipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
+        from .pipeline_wan_vace import WanVACEPipeline
        from .pipeline_wan_video2video import WanVideoToVideoPipeline

 else:
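Both hunks follow diffusers' existing lazy-import convention: the `_import_structure` entry defers the heavy module import until first attribute access, while the `TYPE_CHECKING` branch keeps static analysis working. Assuming the matching top-level re-export added in `diffusers/__init__.py` (listed above), the new pipeline is reachable as:

```python
from diffusers import WanVACEPipeline  # resolved lazily via _import_structure
```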
--- a/diffusers/pipelines/wan/pipeline_wan.py
+++ b/diffusers/pipelines/wan/pipeline_wan.py
@@ -388,8 +388,10 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+                The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):
@@ -400,11 +402,11 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of
-                [Imagen Paper](https://arxiv.org/abs/2205.11487). Guidance scale is enabled by setting
-                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
-                the text `prompt`, usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -417,7 +419,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
+            output_type (`str`, *optional*, defaults to `"np"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -434,8 +436,9 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
 
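The `guidance_scale` edits above only re-point the paper links; for reference, the classifier-free guidance rule they describe reduces to the following sketch (generic CFG with weight `w = guidance_scale`, not the pipeline's literal code):

```python
import torch

def apply_cfg(noise_uncond: torch.Tensor, noise_cond: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # w > 1 pushes the prediction toward the text-conditioned branch,
    # trading sample diversity for prompt adherence.
    return noise_uncond + guidance_scale * (noise_cond - noise_uncond)
```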
--- a/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -380,6 +380,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         device: Optional[torch.device] = None,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
+        last_image: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         latent_height = height // self.vae_scale_factor_spatial
@@ -398,10 +399,17 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         latents = latents.to(device=device, dtype=dtype)
 
         image = image.unsqueeze(2)
-        video_condition = torch.cat(
-            [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
-        )
-        video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
+        if last_image is None:
+            video_condition = torch.cat(
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
+            )
+        else:
+            last_image = last_image.unsqueeze(2)
+            video_condition = torch.cat(
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
+                dim=2,
+            )
+        video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
 
         latents_mean = (
             torch.tensor(self.vae.config.latents_mean)
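A quick shape check of the new branch (toy sizes, not from the diff): with a `last_image`, the zero padding shrinks by one frame so the final temporal slot holds the last frame:

```python
import torch

b, c, h, w, num_frames = 1, 3, 8, 8, 5  # toy sizes
image = torch.randn(b, c, h, w).unsqueeze(2)       # (b, c, 1, h, w)
last_image = torch.randn(b, c, h, w).unsqueeze(2)  # (b, c, 1, h, w)

video_condition = torch.cat(
    [image, image.new_zeros(b, c, num_frames - 2, h, w), last_image], dim=2
)
assert video_condition.shape == (b, c, num_frames, h, w)
```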
@@ -421,10 +429,15 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
         latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)
 
+        latent_condition = latent_condition.to(dtype)
         latent_condition = (latent_condition - latents_mean) * latents_std
 
         mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
-        mask_lat_size[:, :, list(range(1, num_frames))] = 0
+
+        if last_image is None:
+            mask_lat_size[:, :, list(range(1, num_frames))] = 0
+        else:
+            mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0
         first_frame_mask = mask_lat_size[:, :, 0:1]
         first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
         mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
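The mask bookkeeping above can be sanity-checked numerically (illustrative values): with 81 pixel frames and a temporal VAE scale of 4, repeat-interleaving the first-frame mask yields exactly `num_latent_frames * 4` entries:

```python
import torch

num_frames, scale_t = 81, 4
mask = torch.ones(1, 1, num_frames, 4, 4)
mask[:, :, 1:] = 0  # same effect as the list(range(1, num_frames)) indexing
first = torch.repeat_interleave(mask[:, :, 0:1], repeats=scale_t, dim=2)
mask = torch.concat([first, mask[:, :, 1:]], dim=2)

num_latent_frames = (num_frames - 1) // scale_t + 1  # 21
assert mask.shape[2] == num_latent_frames * scale_t  # 84
```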
@@ -476,6 +489,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         prompt_embeds: Optional[torch.Tensor] = None,
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         image_embeds: Optional[torch.Tensor] = None,
+        last_image: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "np",
         return_dict: bool = True,
         attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -508,11 +522,11 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of
-                [Imagen Paper](https://arxiv.org/abs/2205.11487). Guidance scale is enabled by setting
-                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
-                the text `prompt`, usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -531,7 +545,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             image_embeds (`torch.Tensor`, *optional*):
                 Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
                 image embeddings are generated from the `image` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
+            output_type (`str`, *optional*, defaults to `"np"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -548,12 +562,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, *optional*, defaults to `512`):
-                The maximum sequence length of the prompt.
-
-
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
+
 
         Examples:
 
@@ -620,7 +632,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
 
         if image_embeds is None:
-            image_embeds = self.encode_image(image, device)
+            if last_image is None:
+                image_embeds = self.encode_image(image, device)
+            else:
+                image_embeds = self.encode_image([image, last_image], device)
         image_embeds = image_embeds.repeat(batch_size, 1, 1)
         image_embeds = image_embeds.to(transformer_dtype)
 
@@ -631,6 +646,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         # 5. Prepare latent variables
         num_channels_latents = self.vae.config.z_dim
         image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
+        if last_image is not None:
+            last_image = self.video_processor.preprocess(last_image, height=height, width=width).to(
+                device, dtype=torch.float32
+            )
         latents, condition = self.prepare_latents(
             image,
             batch_size * num_videos_per_prompt,
@@ -642,6 +661,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             device,
             generator,
             latents,
+            last_image,
         )
 
         # 6. Denoising loop
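Taken together, the `pipeline_wan_i2v.py` hunks add optional first-and-last-frame conditioning. An end-to-end sketch under stated assumptions (the checkpoint ID and input file names are placeholders, and loading details may differ per checkpoint):

```python
import torch

from diffusers import WanImageToVideoPipeline
from diffusers.utils import load_image

pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers",  # placeholder checkpoint ID
    torch_dtype=torch.bfloat16,
).to("cuda")

first_frame = load_image("first_frame.png")  # placeholder inputs
last_frame = load_image("last_frame.png")

output = pipe(
    image=first_frame,
    last_image=last_frame,  # new in 0.34.0
    prompt="a smooth pan across the scene",
    num_frames=81,
).frames[0]
```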