diffusers 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +48 -1
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/diffusers_cli.py +1 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +1 -1
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/hooks/faster_cache.py +2 -2
- diffusers/hooks/group_offloading.py +128 -29
- diffusers/hooks/hooks.py +2 -2
- diffusers/hooks/layerwise_casting.py +3 -3
- diffusers/hooks/pyramid_attention_broadcast.py +1 -1
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +4 -0
- diffusers/loaders/ip_adapter.py +5 -14
- diffusers/loaders/lora_base.py +212 -111
- diffusers/loaders/lora_conversion_utils.py +275 -34
- diffusers/loaders/lora_pipeline.py +1554 -819
- diffusers/loaders/peft.py +52 -109
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +20 -4
- diffusers/loaders/single_file_utils.py +225 -5
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +1 -1
- diffusers/loaders/transformer_sd3.py +2 -2
- diffusers/loaders/unet.py +2 -16
- diffusers/loaders/unet_loader_utils.py +1 -1
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +15 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +4 -4
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +14 -10
- diffusers/models/auto_model.py +47 -10
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +1 -1
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +16 -15
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +1 -1
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +10 -12
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/modeling_utils.py +44 -14
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +742 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +317 -25
- diffusers/models/transformers/transformer_cosmos.py +579 -0
- diffusers/models/transformers/transformer_flux.py +9 -11
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +2 -2
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +24 -8
- diffusers/models/transformers/transformer_wan_vace.py +393 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +2 -2
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/pipelines/__init__.py +37 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +6 -7
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +1 -1
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +10 -17
- diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
- diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +3 -4
- diffusers/pipelines/pipeline_loading_utils.py +89 -13
- diffusers/pipelines/pipeline_utils.py +105 -33
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +17 -12
- diffusers/pipelines/wan/pipeline_wan_i2v.py +42 -20
- diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +18 -18
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +179 -1
- diffusers/quantizers/base.py +6 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +16 -13
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +8 -8
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -1
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
- diffusers/schedulers/scheduling_utils.py +1 -1
- diffusers/schedulers/scheduling_utils_flax.py +1 -1
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +13 -5
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +120 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
- diffusers/utils/dynamic_modules_utils.py +21 -3
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/import_utils.py +81 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +91 -8
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +59 -7
- diffusers/utils/torch_utils.py +25 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/METADATA +3 -3
- diffusers-0.34.0.dist-info/RECORD +639 -0
- diffusers-0.33.0.dist-info/RECORD +0 -608
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/WHEEL +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/visualcloze/visualcloze_utils.py
@@ -0,0 +1,251 @@
+# Copyright 2025 VisualCloze team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from PIL import Image
+
+from ...image_processor import VaeImageProcessor
+
+
+class VisualClozeProcessor(VaeImageProcessor):
+    """
+    Image processor for the VisualCloze pipeline.
+
+    This processor handles the preprocessing of images for visual cloze tasks, including resizing, normalization, and
+    mask generation.
+
+    Args:
+        resolution (int, optional):
+            Target resolution for processing images. Each image will be resized to this resolution before being
+            concatenated to avoid the out-of-memory error. Defaults to 384.
+        *args: Additional arguments passed to [`~image_processor.VaeImageProcessor`]
+        **kwargs: Additional keyword arguments passed to [`~image_processor.VaeImageProcessor`]
+    """
+
+    def __init__(self, *args, resolution: int = 384, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.resolution = resolution
+
+    def preprocess_image(
+        self, input_images: List[List[Optional[Image.Image]]], vae_scale_factor: int
+    ) -> Tuple[List[List[torch.Tensor]], List[List[List[int]]], List[int]]:
+        """
+        Preprocesses input images for the VisualCloze pipeline.
+
+        This function handles the preprocessing of input images by:
+        1. Resizing and cropping images to maintain consistent dimensions
+        2. Converting images to the Tensor format for the VAE
+        3. Normalizing pixel values
+        4. Tracking image sizes and positions of target images
+
+        Args:
+            input_images (List[List[Optional[Image.Image]]]):
+                A nested list of PIL Images where:
+                - Outer list represents different samples, including in-context examples and the query
+                - Inner list contains images for the task
+                - In the last row, condition images are provided and the target images are placed as None
+            vae_scale_factor (int):
+                The scale factor used by the VAE for resizing images
+
+        Returns:
+            Tuple containing:
+            - List[List[torch.Tensor]]: Preprocessed images in tensor format
+            - List[List[List[int]]]: Dimensions of each processed image [height, width]
+            - List[int]: Target positions indicating which images are to be generated
+        """
+        n_samples, n_task_images = len(input_images), len(input_images[0])
+        divisible = 2 * vae_scale_factor
+
+        processed_images: List[List[Image.Image]] = [[] for _ in range(n_samples)]
+        resize_size: List[Optional[Tuple[int, int]]] = [None for _ in range(n_samples)]
+        target_position: List[int] = []
+
+        # Process each sample
+        for i in range(n_samples):
+            # Determine size from first non-None image
+            for j in range(n_task_images):
+                if input_images[i][j] is not None:
+                    aspect_ratio = input_images[i][j].width / input_images[i][j].height
+                    target_area = self.resolution * self.resolution
+                    new_h = int((target_area / aspect_ratio) ** 0.5)
+                    new_w = int(new_h * aspect_ratio)
+
+                    new_w = max(new_w // divisible, 1) * divisible
+                    new_h = max(new_h // divisible, 1) * divisible
+                    resize_size[i] = (new_w, new_h)
+                    break
+
+            # Process all images in the sample
+            for j in range(n_task_images):
+                if input_images[i][j] is not None:
+                    target = self._resize_and_crop(input_images[i][j], resize_size[i][0], resize_size[i][1])
+                    processed_images[i].append(target)
+                    if i == n_samples - 1:
+                        target_position.append(0)
+                else:
+                    blank = Image.new("RGB", resize_size[i] or (self.resolution, self.resolution), (0, 0, 0))
+                    processed_images[i].append(blank)
+                    if i == n_samples - 1:
+                        target_position.append(1)
+
+        # Ensure consistent width for multiple target images when there are multiple target images
+        if len(target_position) > 1 and sum(target_position) > 1:
+            new_w = resize_size[n_samples - 1][0] or 384
+            for i in range(len(processed_images)):
+                for j in range(len(processed_images[i])):
+                    if processed_images[i][j] is not None:
+                        new_h = int(processed_images[i][j].height * (new_w / processed_images[i][j].width))
+                        new_w = int(new_w / 16) * 16
+                        new_h = int(new_h / 16) * 16
+                        processed_images[i][j] = self.resize(processed_images[i][j], new_h, new_w)
+
+        # Convert to tensors and normalize
+        image_sizes = []
+        for i in range(len(processed_images)):
+            image_sizes.append([[img.height, img.width] for img in processed_images[i]])
+            for j, image in enumerate(processed_images[i]):
+                image = self.pil_to_numpy(image)
+                image = self.numpy_to_pt(image)
+                image = self.normalize(image)
+                processed_images[i][j] = image
+
+        return processed_images, image_sizes, target_position
+
+    def preprocess_mask(
+        self, input_images: List[List[Image.Image]], target_position: List[int]
+    ) -> List[List[torch.Tensor]]:
+        """
+        Generate masks for the VisualCloze pipeline.
+
+        Args:
+            input_images (List[List[Image.Image]]):
+                Processed images from preprocess_image
+            target_position (List[int]):
+                Binary list marking the positions of target images (1 for target, 0 for condition)
+
+        Returns:
+            List[List[torch.Tensor]]:
+                A nested list of mask tensors (1 for target positions, 0 for condition images)
+        """
+        mask = []
+        for i, row in enumerate(input_images):
+            if i == len(input_images) - 1:  # Query row
+                row_masks = [
+                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=m) for m in target_position
+                ]
+            else:  # In-context examples
+                row_masks = [
+                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=0) for _ in target_position
+                ]
+            mask.append(row_masks)
+        return mask
+
+    def preprocess_image_upsampling(
+        self,
+        input_images: List[List[Image.Image]],
+        height: int,
+        width: int,
+    ) -> Tuple[List[List[Image.Image]], List[List[List[int]]]]:
+        """Process images for the upsampling stage in the VisualCloze pipeline.
+
+        Args:
+            input_images: Input image to process
+            height: Target height
+            width: Target width
+
+        Returns:
+            Tuple of processed image and its size
+        """
+        image = self.resize(input_images[0][0], height, width)
+        image = self.pil_to_numpy(image)  # to np
+        image = self.numpy_to_pt(image)  # to pt
+        image = self.normalize(image)
+
+        input_images[0][0] = image
+        image_sizes = [[[height, width]]]
+        return input_images, image_sizes
+
+    def preprocess_mask_upsampling(self, input_images: List[List[Image.Image]]) -> List[List[torch.Tensor]]:
+        return [[torch.ones((1, 1, input_images[0][0].shape[2], input_images[0][0].shape[3]))]]
+
+    def get_layout_prompt(self, size: Tuple[int, int]) -> str:
+        layout_instruction = (
+            f"A grid layout with {size[0]} rows and {size[1]} columns, displaying {size[0] * size[1]} images arranged side by side."
+        )
+        return layout_instruction
+
+    def preprocess(
+        self,
+        task_prompt: Union[str, List[str]],
+        content_prompt: Union[str, List[str]],
+        input_images: Optional[List[List[List[Optional[str]]]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        upsampling: bool = False,
+        vae_scale_factor: int = 16,
+    ) -> Dict:
+        """Process visual cloze inputs.
+
+        Args:
+            task_prompt: Task description(s)
+            content_prompt: Content description(s)
+            input_images: List of images or None for the target images
+            height: Optional target height for upsampling stage
+            width: Optional target width for upsampling stage
+            upsampling: Whether this is in the upsampling processing stage
+
+        Returns:
+            Dictionary containing processed images, masks, prompts and metadata
+        """
+        if isinstance(task_prompt, str):
+            task_prompt = [task_prompt]
+            content_prompt = [content_prompt]
+            input_images = [input_images]
+
+        output = {
+            "init_image": [],
+            "mask": [],
+            "task_prompt": task_prompt if not upsampling else [None for _ in range(len(task_prompt))],
+            "content_prompt": content_prompt,
+            "layout_prompt": [],
+            "target_position": [],
+            "image_size": [],
+        }
+        for i in range(len(task_prompt)):
+            if upsampling:
+                layout_prompt = None
+            else:
+                layout_prompt = self.get_layout_prompt((len(input_images[i]), len(input_images[i][0])))
+
+            if upsampling:
+                cur_processed_images, cur_image_size = self.preprocess_image_upsampling(
+                    input_images[i], height=height, width=width
+                )
+                cur_mask = self.preprocess_mask_upsampling(cur_processed_images)
+            else:
+                cur_processed_images, cur_image_size, cur_target_position = self.preprocess_image(
+                    input_images[i], vae_scale_factor=vae_scale_factor
+                )
+                cur_mask = self.preprocess_mask(cur_processed_images, cur_target_position)
+
+                output["target_position"].append(cur_target_position)
+
+            output["image_size"].append(cur_image_size)
+            output["init_image"].append(cur_processed_images)
+            output["mask"].append(cur_mask)
+            output["layout_prompt"].append(layout_prompt)
+
+        return output
diffusers/pipelines/wan/__init__.py
@@ -24,6 +24,7 @@ except OptionalDependencyNotAvailable:
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
+    _import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
     _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     else:
         from .pipeline_wan import WanPipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
+        from .pipeline_wan_vace import WanVACEPipeline
         from .pipeline_wan_video2video import WanVideoToVideoPipeline

 else:
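For context, each pipeline is registered twice because diffusers `__init__` modules are lazy: `_import_structure` feeds runtime attribute lookup, while the `TYPE_CHECKING` branch gives static analyzers real imports. A simplified sketch of the idea (not the actual `_LazyModule` implementation in `diffusers.utils`):

```python
import importlib
from types import ModuleType


class LazyModuleSketch(ModuleType):
    """Imports a submodule only when one of its registered names is first accessed."""

    def __init__(self, name: str, import_structure: dict):
        super().__init__(name)
        # Invert {submodule: [names]} into {name: submodule} for O(1) lookup.
        self._name_to_submodule = {
            attr: submodule for submodule, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr: str):
        if attr not in self._name_to_submodule:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        submodule = importlib.import_module(f"{self.__name__}.{self._name_to_submodule[attr]}")
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so later lookups skip __getattr__
        return value
```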
diffusers/pipelines/wan/pipeline_wan.py
@@ -15,7 +15,6 @@
 import html
 from typing import Any, Callable, Dict, List, Optional, Union

-import ftfy
 import regex as re
 import torch
 from transformers import AutoTokenizer, UMT5EncoderModel
@@ -24,7 +23,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...loaders import WanLoraLoaderMixin
 from ...models import AutoencoderKLWan, WanTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -40,6 +39,9 @@ else:

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+if is_ftfy_available():
+    import ftfy
+

 EXAMPLE_DOC_STRING = """
 Examples:
@@ -386,8 +388,10 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):

         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+                The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):
@@ -398,11 +402,11 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of
-                [Imagen Paper](https://arxiv.org/abs/2205.11487). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -415,7 +419,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
+            output_type (`str`, *optional*, defaults to `"np"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -432,8 +436,9 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, defaults to `512`):
-                The maximum sequence length of the prompt.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.

         Examples:

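The corrected `guidance_scale` docstring refers to the classifier-free guidance weight `w`. As a reminder of what that weight controls, the unconditional and text-conditional predictions are blended as below (a generic sketch of the combination step, not code from this diff):

```python
import torch


def apply_classifier_free_guidance(
    noise_uncond: torch.Tensor, noise_cond: torch.Tensor, guidance_scale: float
) -> torch.Tensor:
    # Algebraically equal to w * eps(z, c) + (1 - w) * eps(z); values above 1
    # pull the prediction toward the text prompt, usually at some cost in fidelity.
    return noise_uncond + guidance_scale * (noise_cond - noise_uncond)
```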
diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -15,7 +15,6 @@
 import html
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

-import ftfy
 import PIL
 import regex as re
 import torch
@@ -26,7 +25,7 @@ from ...image_processor import PipelineImageInput
 from ...loaders import WanLoraLoaderMixin
 from ...models import AutoencoderKLWan, WanTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -42,6 +41,9 @@ else:

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+if is_ftfy_available():
+    import ftfy
+
 EXAMPLE_DOC_STRING = """
 Examples:
     ```python
@@ -378,6 +380,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         device: Optional[torch.device] = None,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
+        last_image: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         latent_height = height // self.vae_scale_factor_spatial
@@ -396,10 +399,17 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         latents = latents.to(device=device, dtype=dtype)

         image = image.unsqueeze(2)
-        video_condition = torch.cat(
-            [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
-        )
-        video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
+        if last_image is None:
+            video_condition = torch.cat(
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
+            )
+        else:
+            last_image = last_image.unsqueeze(2)
+            video_condition = torch.cat(
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
+                dim=2,
+            )
+        video_condition = video_condition.to(device=device, dtype=self.vae.dtype)

         latents_mean = (
             torch.tensor(self.vae.config.latents_mean)
@@ -419,10 +429,15 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
         latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)

+        latent_condition = latent_condition.to(dtype)
         latent_condition = (latent_condition - latents_mean) * latents_std

         mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
-        mask_lat_size[:, :, list(range(1, num_frames))] = 0
+
+        if last_image is None:
+            mask_lat_size[:, :, list(range(1, num_frames))] = 0
+        else:
+            mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0
         first_frame_mask = mask_lat_size[:, :, 0:1]
         first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
         mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
@@ -474,6 +489,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         prompt_embeds: Optional[torch.Tensor] = None,
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         image_embeds: Optional[torch.Tensor] = None,
+        last_image: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "np",
         return_dict: bool = True,
         attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -506,11 +522,11 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of
-                [Imagen Paper](https://arxiv.org/abs/2205.11487). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -529,7 +545,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             image_embeds (`torch.Tensor`, *optional*):
                 Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
                 image embeddings are generated from the `image` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
+            output_type (`str`, *optional*, defaults to `"np"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -546,12 +562,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, defaults to `512`):
-                The maximum sequence length of the prompt.
-
-
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
+
         Examples:

         Returns:
@@ -618,7 +632,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)

         if image_embeds is None:
-            image_embeds = self.encode_image(image, device)
+            if last_image is None:
+                image_embeds = self.encode_image(image, device)
+            else:
+                image_embeds = self.encode_image([image, last_image], device)
         image_embeds = image_embeds.repeat(batch_size, 1, 1)
         image_embeds = image_embeds.to(transformer_dtype)

@@ -629,6 +646,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         # 5. Prepare latent variables
         num_channels_latents = self.vae.config.z_dim
         image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
+        if last_image is not None:
+            last_image = self.video_processor.preprocess(last_image, height=height, width=width).to(
+                device, dtype=torch.float32
+            )
         latents, condition = self.prepare_latents(
             image,
             batch_size * num_videos_per_prompt,
@@ -640,6 +661,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             device,
             generator,
             latents,
+            last_image,
         )

         # 6. Denoising loop
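Taken together, the `pipeline_wan_i2v.py` hunks add first-and-last-frame conditioning: zero frames are inserted between `image` and `last_image`, and the latent mask keeps both endpoints instead of only the first frame. A hypothetical invocation (the checkpoint id, file names, and settings are assumptions, not taken from this diff):

```python
import torch

from diffusers import WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.1-FLF2V-14B-720P-Diffusers",  # assumed first/last-frame checkpoint
    torch_dtype=torch.bfloat16,
).to("cuda")

first_frame = load_image("first.png")
last_frame = load_image("last.png")

video = pipe(
    image=first_frame,
    last_image=last_frame,  # new in 0.34.0; omit it to recover the old behavior
    prompt="A smooth camera move between the two keyframes",
    num_frames=81,
    guidance_scale=5.0,
).frames[0]
export_to_video(video, "wan_flf2v.mp4", fps=16)
```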