diffusers 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +48 -1
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/diffusers_cli.py +1 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +1 -1
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/hooks/faster_cache.py +2 -2
- diffusers/hooks/group_offloading.py +128 -29
- diffusers/hooks/hooks.py +2 -2
- diffusers/hooks/layerwise_casting.py +3 -3
- diffusers/hooks/pyramid_attention_broadcast.py +1 -1
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +4 -0
- diffusers/loaders/ip_adapter.py +5 -14
- diffusers/loaders/lora_base.py +212 -111
- diffusers/loaders/lora_conversion_utils.py +275 -34
- diffusers/loaders/lora_pipeline.py +1554 -819
- diffusers/loaders/peft.py +52 -109
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +20 -4
- diffusers/loaders/single_file_utils.py +225 -5
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +1 -1
- diffusers/loaders/transformer_sd3.py +2 -2
- diffusers/loaders/unet.py +2 -16
- diffusers/loaders/unet_loader_utils.py +1 -1
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +15 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +4 -4
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +14 -10
- diffusers/models/auto_model.py +47 -10
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +1 -1
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +16 -15
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +1 -1
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +10 -12
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/modeling_utils.py +44 -14
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +742 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +317 -25
- diffusers/models/transformers/transformer_cosmos.py +579 -0
- diffusers/models/transformers/transformer_flux.py +9 -11
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +2 -2
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +24 -8
- diffusers/models/transformers/transformer_wan_vace.py +393 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +2 -2
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/pipelines/__init__.py +37 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +6 -7
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +1 -1
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +10 -17
- diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
- diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +3 -4
- diffusers/pipelines/pipeline_loading_utils.py +89 -13
- diffusers/pipelines/pipeline_utils.py +105 -33
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +13 -10
- diffusers/pipelines/wan/pipeline_wan_i2v.py +38 -18
- diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +179 -1
- diffusers/quantizers/base.py +6 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +16 -13
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +8 -8
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -1
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
- diffusers/schedulers/scheduling_utils.py +1 -1
- diffusers/schedulers/scheduling_utils_flax.py +1 -1
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +13 -5
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +120 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
- diffusers/utils/dynamic_modules_utils.py +21 -3
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/import_utils.py +81 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +91 -8
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +59 -7
- diffusers/utils/torch_utils.py +25 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/METADATA +70 -55
- diffusers-0.34.0.dist-info/RECORD +639 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/WHEEL +1 -1
- diffusers-0.33.1.dist-info/RECORD +0 -608
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -100,6 +100,50 @@ DEFAULT_PROMPT_TEMPLATE = {
|
|
100
100
|
}
|
101
101
|
|
102
102
|
|
103
|
+
def _expand_input_ids_with_image_tokens(
|
104
|
+
text_input_ids,
|
105
|
+
prompt_attention_mask,
|
106
|
+
max_sequence_length,
|
107
|
+
image_token_index,
|
108
|
+
image_emb_len,
|
109
|
+
image_emb_start,
|
110
|
+
image_emb_end,
|
111
|
+
pad_token_id,
|
112
|
+
):
|
113
|
+
special_image_token_mask = text_input_ids == image_token_index
|
114
|
+
num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
|
115
|
+
batch_indices, non_image_indices = torch.where(text_input_ids != image_token_index)
|
116
|
+
|
117
|
+
max_expanded_length = max_sequence_length + (num_special_image_tokens.max() * (image_emb_len - 1))
|
118
|
+
new_token_positions = torch.cumsum((special_image_token_mask * (image_emb_len - 1) + 1), -1) - 1
|
119
|
+
text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
|
120
|
+
|
121
|
+
expanded_input_ids = torch.full(
|
122
|
+
(text_input_ids.shape[0], max_expanded_length),
|
123
|
+
pad_token_id,
|
124
|
+
dtype=text_input_ids.dtype,
|
125
|
+
device=text_input_ids.device,
|
126
|
+
)
|
127
|
+
expanded_input_ids[batch_indices, text_to_overwrite] = text_input_ids[batch_indices, non_image_indices]
|
128
|
+
expanded_input_ids[batch_indices, image_emb_start:image_emb_end] = image_token_index
|
129
|
+
|
130
|
+
expanded_attention_mask = torch.zeros(
|
131
|
+
(text_input_ids.shape[0], max_expanded_length),
|
132
|
+
dtype=prompt_attention_mask.dtype,
|
133
|
+
device=prompt_attention_mask.device,
|
134
|
+
)
|
135
|
+
attn_batch_indices, attention_indices = torch.where(expanded_input_ids != pad_token_id)
|
136
|
+
expanded_attention_mask[attn_batch_indices, attention_indices] = 1.0
|
137
|
+
expanded_attention_mask = expanded_attention_mask.to(prompt_attention_mask.dtype)
|
138
|
+
position_ids = (expanded_attention_mask.cumsum(-1) - 1).masked_fill_((expanded_attention_mask == 0), 1)
|
139
|
+
|
140
|
+
return {
|
141
|
+
"input_ids": expanded_input_ids,
|
142
|
+
"attention_mask": expanded_attention_mask,
|
143
|
+
"position_ids": position_ids,
|
144
|
+
}
|
145
|
+
|
146
|
+
|
103
147
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
104
148
|
def retrieve_timesteps(
|
105
149
|
scheduler,
|
@@ -251,6 +295,12 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
|
|
251
295
|
prompt = [prompt_template["template"].format(p) for p in prompt]
|
252
296
|
|
253
297
|
crop_start = prompt_template.get("crop_start", None)
|
298
|
+
|
299
|
+
image_emb_len = prompt_template.get("image_emb_len", 576)
|
300
|
+
image_emb_start = prompt_template.get("image_emb_start", 5)
|
301
|
+
image_emb_end = prompt_template.get("image_emb_end", 581)
|
302
|
+
double_return_token_id = prompt_template.get("double_return_token_id", 271)
|
303
|
+
|
254
304
|
if crop_start is None:
|
255
305
|
prompt_template_input = self.tokenizer(
|
256
306
|
prompt_template["template"],
|
@@ -280,19 +330,25 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
|
|
280
330
|
|
281
331
|
image_embeds = self.image_processor(image, return_tensors="pt").pixel_values.to(device)
|
282
332
|
|
333
|
+
image_token_index = self.text_encoder.config.image_token_index
|
334
|
+
pad_token_id = self.text_encoder.config.pad_token_id
|
335
|
+
expanded_inputs = _expand_input_ids_with_image_tokens(
|
336
|
+
text_input_ids,
|
337
|
+
prompt_attention_mask,
|
338
|
+
max_sequence_length,
|
339
|
+
image_token_index,
|
340
|
+
image_emb_len,
|
341
|
+
image_emb_start,
|
342
|
+
image_emb_end,
|
343
|
+
pad_token_id,
|
344
|
+
)
|
283
345
|
prompt_embeds = self.text_encoder(
|
284
|
-
|
285
|
-
attention_mask=prompt_attention_mask,
|
346
|
+
**expanded_inputs,
|
286
347
|
pixel_values=image_embeds,
|
287
348
|
output_hidden_states=True,
|
288
349
|
).hidden_states[-(num_hidden_layers_to_skip + 1)]
|
289
350
|
prompt_embeds = prompt_embeds.to(dtype=dtype)
|
290
351
|
|
291
|
-
image_emb_len = prompt_template.get("image_emb_len", 576)
|
292
|
-
image_emb_start = prompt_template.get("image_emb_start", 5)
|
293
|
-
image_emb_end = prompt_template.get("image_emb_end", 581)
|
294
|
-
double_return_token_id = prompt_template.get("double_return_token_id", 271)
|
295
|
-
|
296
352
|
if crop_start is not None and crop_start > 0:
|
297
353
|
text_crop_start = crop_start - 1 + image_emb_len
|
298
354
|
batch_indices, last_double_return_token_indices = torch.where(text_input_ids == double_return_token_id)
|
@@ -655,13 +711,13 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
|
|
655
711
|
true_cfg_scale (`float`, *optional*, defaults to 1.0):
|
656
712
|
When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
|
657
713
|
guidance_scale (`float`, defaults to `1.0`):
|
658
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
659
|
-
`guidance_scale` is defined as `w` of equation 2.
|
660
|
-
Paper](https://
|
661
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
662
|
-
usually at the expense of lower image quality. Note that the only available
|
663
|
-
CFG-distilled, which means that traditional guidance between unconditional and
|
664
|
-
not applied.
|
714
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
715
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
716
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
717
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
718
|
+
the text `prompt`, usually at the expense of lower image quality. Note that the only available
|
719
|
+
HunyuanVideo model is CFG-distilled, which means that traditional guidance between unconditional and
|
720
|
+
conditional latent is not applied.
|
665
721
|
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
666
722
|
The number of images to generate per prompt.
|
667
723
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
@@ -1,5 +1,8 @@
|
|
1
1
|
from dataclasses import dataclass
|
2
|
+
from typing import List, Union
|
2
3
|
|
4
|
+
import numpy as np
|
5
|
+
import PIL.Image
|
3
6
|
import torch
|
4
7
|
|
5
8
|
from diffusers.utils import BaseOutput
|
@@ -18,3 +21,19 @@ class HunyuanVideoPipelineOutput(BaseOutput):
|
|
18
21
|
"""
|
19
22
|
|
20
23
|
frames: torch.Tensor
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass
|
27
|
+
class HunyuanVideoFramepackPipelineOutput(BaseOutput):
|
28
|
+
r"""
|
29
|
+
Output class for HunyuanVideo pipelines.
|
30
|
+
|
31
|
+
Args:
|
32
|
+
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
33
|
+
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
|
34
|
+
denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
|
35
|
+
`(batch_size, num_frames, channels, height, width)`. Or, a list of torch tensors where each tensor
|
36
|
+
corresponds to a latent that decodes to multiple frames.
|
37
|
+
"""
|
38
|
+
|
39
|
+
frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]], List[torch.Tensor]]
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 HunyuanDiT Authors and The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -128,7 +128,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
|
128
128
|
r"""
|
129
129
|
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
130
130
|
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
131
|
-
Flawed](https://
|
131
|
+
Flawed](https://huggingface.co/papers/2305.08891).
|
132
132
|
|
133
133
|
Args:
|
134
134
|
noise_cfg (`torch.Tensor`):
|
@@ -433,7 +433,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
|
|
433
433
|
def prepare_extra_step_kwargs(self, generator, eta):
|
434
434
|
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
435
435
|
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
436
|
-
# eta corresponds to η in DDIM paper: https://
|
436
|
+
# eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
|
437
437
|
# and should be between [0, 1]
|
438
438
|
|
439
439
|
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
@@ -555,7 +555,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
|
|
555
555
|
return self._guidance_rescale
|
556
556
|
|
557
557
|
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
558
|
-
# of the Imagen paper: https://
|
558
|
+
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
559
559
|
# corresponds to doing no classifier free guidance.
|
560
560
|
@property
|
561
561
|
def do_classifier_free_guidance(self):
|
@@ -625,8 +625,8 @@ class HunyuanDiTPipeline(DiffusionPipeline):
|
|
625
625
|
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
626
626
|
The number of images to generate per prompt.
|
627
627
|
eta (`float`, *optional*, defaults to 0.0):
|
628
|
-
Corresponds to parameter eta (η) from the [DDIM](https://
|
629
|
-
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
628
|
+
Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
|
629
|
+
applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
630
630
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
631
631
|
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
632
632
|
generation deterministic.
|
@@ -662,7 +662,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
|
|
662
662
|
inputs will be passed.
|
663
663
|
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
664
664
|
Rescale the noise_cfg according to `guidance_rescale`. Based on findings of [Common Diffusion Noise
|
665
|
-
Schedules and Sample Steps are Flawed](https://
|
665
|
+
Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891). See Section 3.4
|
666
666
|
original_size (`Tuple[int, int]`, *optional*, defaults to `(1024, 1024)`):
|
667
667
|
The original size of the image. Used to calculate the time ids.
|
668
668
|
target_size (`Tuple[int, int]`, *optional*):
|
@@ -865,7 +865,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
|
|
865
865
|
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
866
866
|
|
867
867
|
if self.do_classifier_free_guidance and guidance_rescale > 0.0:
|
868
|
-
# Based on 3.4. in https://
|
868
|
+
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
869
869
|
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
|
870
870
|
|
871
871
|
# compute the previous noisy sample x_t -> x_t-1
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -33,7 +33,7 @@ from ...utils import (
|
|
33
33
|
)
|
34
34
|
from ...utils.torch_utils import randn_tensor
|
35
35
|
from ...video_processor import VideoProcessor
|
36
|
-
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
|
36
|
+
from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin
|
37
37
|
|
38
38
|
|
39
39
|
if is_torch_xla_available():
|
@@ -97,9 +97,11 @@ class I2VGenXLPipelineOutput(BaseOutput):
|
|
97
97
|
|
98
98
|
|
99
99
|
class I2VGenXLPipeline(
|
100
|
+
DeprecatedPipelineMixin,
|
100
101
|
DiffusionPipeline,
|
101
102
|
StableDiffusionMixin,
|
102
103
|
):
|
104
|
+
_last_supported_version = "0.33.1"
|
103
105
|
r"""
|
104
106
|
Pipeline for image-to-video generation as proposed in [I2VGenXL](https://i2vgen-xl.github.io/).
|
105
107
|
|
@@ -151,7 +153,7 @@ class I2VGenXLPipeline(
|
|
151
153
|
return self._guidance_scale
|
152
154
|
|
153
155
|
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
154
|
-
# of the Imagen paper: https://
|
156
|
+
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
155
157
|
# corresponds to doing no classifier free guidance.
|
156
158
|
@property
|
157
159
|
def do_classifier_free_guidance(self):
|
@@ -384,7 +386,7 @@ class I2VGenXLPipeline(
|
|
384
386
|
def prepare_extra_step_kwargs(self, generator, eta):
|
385
387
|
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
386
388
|
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
387
|
-
# eta corresponds to η in DDIM paper: https://
|
389
|
+
# eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
|
388
390
|
# and should be between [0, 1]
|
389
391
|
|
390
392
|
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
@@ -462,7 +464,7 @@ class I2VGenXLPipeline(
|
|
462
464
|
image_latents = image_latents.unsqueeze(2)
|
463
465
|
|
464
466
|
# Append a position mask for each subsequent frame
|
465
|
-
# after the
|
467
|
+
# after the initial image latent frame
|
466
468
|
frame_position_mask = []
|
467
469
|
for frame_idx in range(num_frames - 1):
|
468
470
|
scale = (frame_idx + 1) / (num_frames - 1)
|
@@ -557,8 +559,8 @@ class I2VGenXLPipeline(
|
|
557
559
|
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
|
558
560
|
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
|
559
561
|
eta (`float`, *optional*):
|
560
|
-
Corresponds to parameter eta (η) from the [DDIM](https://
|
561
|
-
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
562
|
+
Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
|
563
|
+
applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
562
564
|
num_videos_per_prompt (`int`, *optional*):
|
563
565
|
The number of images to generate per prompt.
|
564
566
|
decode_chunk_size (`int`, *optional*):
|
@@ -614,7 +616,7 @@ class I2VGenXLPipeline(
|
|
614
616
|
|
615
617
|
device = self._execution_device
|
616
618
|
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
617
|
-
# of the Imagen paper: https://
|
619
|
+
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
618
620
|
# corresponds to doing no classifier free guidance.
|
619
621
|
self._guidance_scale = guidance_scale
|
620
622
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -278,11 +278,11 @@ class KandinskyPipeline(DiffusionPipeline):
|
|
278
278
|
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
279
279
|
expense of slower inference.
|
280
280
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
281
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
282
|
-
`guidance_scale` is defined as `w` of equation 2.
|
283
|
-
Paper](https://
|
284
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
285
|
-
usually at the expense of lower image quality.
|
281
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
282
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
283
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
284
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
285
|
+
the text `prompt`, usually at the expense of lower image quality.
|
286
286
|
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
287
287
|
The number of images to generate per prompt.
|
288
288
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -193,7 +193,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
|
|
193
193
|
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
194
194
|
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
195
195
|
|
196
|
-
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] =
|
196
|
+
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
|
197
197
|
r"""
|
198
198
|
Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
|
199
199
|
Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
|
@@ -251,20 +251,20 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
|
|
251
251
|
width (`int`, *optional*, defaults to 512):
|
252
252
|
The width in pixels of the generated image.
|
253
253
|
prior_guidance_scale (`float`, *optional*, defaults to 4.0):
|
254
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
255
|
-
`guidance_scale` is defined as `w` of equation 2.
|
256
|
-
Paper](https://
|
257
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
258
|
-
usually at the expense of lower image quality.
|
254
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
255
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
256
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
257
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
258
|
+
the text `prompt`, usually at the expense of lower image quality.
|
259
259
|
prior_num_inference_steps (`int`, *optional*, defaults to 100):
|
260
260
|
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
261
261
|
expense of slower inference.
|
262
262
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
263
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
264
|
-
`guidance_scale` is defined as `w` of equation 2.
|
265
|
-
Paper](https://
|
266
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
267
|
-
usually at the expense of lower image quality.
|
263
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
264
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
265
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
266
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
267
|
+
the text `prompt`, usually at the expense of lower image quality.
|
268
268
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
269
269
|
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
270
270
|
to make generation deterministic.
|
@@ -411,7 +411,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
|
|
411
411
|
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
412
412
|
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
413
413
|
|
414
|
-
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] =
|
414
|
+
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
|
415
415
|
r"""
|
416
416
|
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
417
417
|
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
@@ -482,20 +482,20 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
|
|
482
482
|
be maximum and the denoising process will run for the full number of iterations specified in
|
483
483
|
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
484
484
|
prior_guidance_scale (`float`, *optional*, defaults to 4.0):
|
485
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
486
|
-
`guidance_scale` is defined as `w` of equation 2.
|
487
|
-
Paper](https://
|
488
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
489
|
-
usually at the expense of lower image quality.
|
485
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
486
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
487
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
488
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
489
|
+
the text `prompt`, usually at the expense of lower image quality.
|
490
490
|
prior_num_inference_steps (`int`, *optional*, defaults to 100):
|
491
491
|
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
492
492
|
expense of slower inference.
|
493
493
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
494
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
495
|
-
`guidance_scale` is defined as `w` of equation 2.
|
496
|
-
Paper](https://
|
497
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
498
|
-
usually at the expense of lower image quality.
|
494
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
495
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
496
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
497
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
498
|
+
the text `prompt`, usually at the expense of lower image quality.
|
499
499
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
500
500
|
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
501
501
|
to make generation deterministic.
|
@@ -652,7 +652,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
|
|
652
652
|
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
|
653
653
|
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
|
654
654
|
|
655
|
-
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] =
|
655
|
+
def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
|
656
656
|
r"""
|
657
657
|
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
|
658
658
|
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
|
@@ -722,20 +722,20 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
|
|
722
722
|
width (`int`, *optional*, defaults to 512):
|
723
723
|
The width in pixels of the generated image.
|
724
724
|
prior_guidance_scale (`float`, *optional*, defaults to 4.0):
|
725
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
726
|
-
`guidance_scale` is defined as `w` of equation 2.
|
727
|
-
Paper](https://
|
728
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
729
|
-
usually at the expense of lower image quality.
|
725
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
726
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
727
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
728
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
729
|
+
the text `prompt`, usually at the expense of lower image quality.
|
730
730
|
prior_num_inference_steps (`int`, *optional*, defaults to 100):
|
731
731
|
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
732
732
|
expense of slower inference.
|
733
733
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
734
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
735
|
-
`guidance_scale` is defined as `w` of equation 2.
|
736
|
-
Paper](https://
|
737
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
738
|
-
usually at the expense of lower image quality.
|
734
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
735
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
736
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
737
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
738
|
+
the text `prompt`, usually at the expense of lower image quality.
|
739
739
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
740
740
|
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
741
741
|
to make generation deterministic.
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -14,14 +14,13 @@
|
|
14
14
|
|
15
15
|
from typing import Callable, List, Optional, Union
|
16
16
|
|
17
|
-
import numpy as np
|
18
17
|
import PIL.Image
|
19
18
|
import torch
|
20
|
-
from PIL import Image
|
21
19
|
from transformers import (
|
22
20
|
XLMRobertaTokenizer,
|
23
21
|
)
|
24
22
|
|
23
|
+
from ...image_processor import VaeImageProcessor
|
25
24
|
from ...models import UNet2DConditionModel, VQModel
|
26
25
|
from ...schedulers import DDIMScheduler
|
27
26
|
from ...utils import (
|
@@ -95,15 +94,6 @@ def get_new_h_w(h, w, scale_factor=8):
|
|
95
94
|
return new_h * scale_factor, new_w * scale_factor
|
96
95
|
|
97
96
|
|
98
|
-
def prepare_image(pil_image, w=512, h=512):
|
99
|
-
pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
|
100
|
-
arr = np.array(pil_image.convert("RGB"))
|
101
|
-
arr = arr.astype(np.float32) / 127.5 - 1
|
102
|
-
arr = np.transpose(arr, [2, 0, 1])
|
103
|
-
image = torch.from_numpy(arr).unsqueeze(0)
|
104
|
-
return image
|
105
|
-
|
106
|
-
|
107
97
|
class KandinskyImg2ImgPipeline(DiffusionPipeline):
|
108
98
|
"""
|
109
99
|
Pipeline for image-to-image generation using Kandinsky
|
@@ -143,7 +133,16 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
|
|
143
133
|
scheduler=scheduler,
|
144
134
|
movq=movq,
|
145
135
|
)
|
146
|
-
self.movq_scale_factor =
|
136
|
+
self.movq_scale_factor = (
|
137
|
+
2 ** (len(self.movq.config.block_out_channels) - 1) if getattr(self, "movq", None) else 8
|
138
|
+
)
|
139
|
+
movq_latent_channels = self.movq.config.latent_channels if getattr(self, "movq", None) else 4
|
140
|
+
self.image_processor = VaeImageProcessor(
|
141
|
+
vae_scale_factor=self.movq_scale_factor,
|
142
|
+
vae_latent_channels=movq_latent_channels,
|
143
|
+
resample="bicubic",
|
144
|
+
reducing_gap=1,
|
145
|
+
)
|
147
146
|
|
148
147
|
def get_timesteps(self, num_inference_steps, strength, device):
|
149
148
|
# get the original timestep using init_timestep
|
@@ -350,11 +349,11 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
|
|
350
349
|
be maximum and the denoising process will run for the full number of iterations specified in
|
351
350
|
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
352
351
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
353
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
354
|
-
`guidance_scale` is defined as `w` of equation 2.
|
355
|
-
Paper](https://
|
356
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
357
|
-
usually at the expense of lower image quality.
|
352
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
353
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
354
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
355
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
356
|
+
the text `prompt`, usually at the expense of lower image quality.
|
358
357
|
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
359
358
|
The number of images to generate per prompt.
|
360
359
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
@@ -417,7 +416,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
|
|
417
416
|
f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor"
|
418
417
|
)
|
419
418
|
|
420
|
-
image = torch.cat([
|
419
|
+
image = torch.cat([self.image_processor.preprocess(i, width, height) for i in image], dim=0)
|
421
420
|
image = image.to(dtype=prompt_embeds.dtype, device=device)
|
422
421
|
|
423
422
|
latents = self.movq.encode(image)["latents"]
|
@@ -498,13 +497,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
|
|
498
497
|
if output_type not in ["pt", "np", "pil"]:
|
499
498
|
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
|
500
499
|
|
501
|
-
|
502
|
-
image = image * 0.5 + 0.5
|
503
|
-
image = image.clamp(0, 1)
|
504
|
-
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
505
|
-
|
506
|
-
if output_type == "pil":
|
507
|
-
image = self.numpy_to_pil(image)
|
500
|
+
image = self.image_processor.postprocess(image, output_type)
|
508
501
|
|
509
502
|
if not return_dict:
|
510
503
|
return (image,)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -456,11 +456,11 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
|
|
456
456
|
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
457
457
|
expense of slower inference.
|
458
458
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
459
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
460
|
-
`guidance_scale` is defined as `w` of equation 2.
|
461
|
-
Paper](https://
|
462
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
463
|
-
usually at the expense of lower image quality.
|
459
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
460
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
461
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
462
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
463
|
+
the text `prompt`, usually at the expense of lower image quality.
|
464
464
|
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
465
465
|
The number of images to generate per prompt.
|
466
466
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
@@ -496,7 +496,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
|
|
496
496
|
"As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
|
497
497
|
"This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
|
498
498
|
"THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
|
499
|
-
"This warning will be
|
499
|
+
"This warning will be suppressed after the first inference call and will be removed in diffusers>0.23.0"
|
500
500
|
)
|
501
501
|
self._warn_has_been_called = True
|
502
502
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -220,11 +220,11 @@ class KandinskyPriorPipeline(DiffusionPipeline):
|
|
220
220
|
The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
|
221
221
|
`guidance_scale` is less than `1`).
|
222
222
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
223
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
224
|
-
`guidance_scale` is defined as `w` of equation 2.
|
225
|
-
Paper](https://
|
226
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
227
|
-
usually at the expense of lower image quality.
|
223
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
224
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
225
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
226
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
227
|
+
the text `prompt`, usually at the expense of lower image quality.
|
228
228
|
|
229
229
|
Examples:
|
230
230
|
|
@@ -439,11 +439,11 @@ class KandinskyPriorPipeline(DiffusionPipeline):
|
|
439
439
|
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
440
440
|
tensor will be generated by sampling using the supplied random `generator`.
|
441
441
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
442
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
443
|
-
`guidance_scale` is defined as `w` of equation 2.
|
444
|
-
Paper](https://
|
445
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
446
|
-
usually at the expense of lower image quality.
|
442
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
443
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
444
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
445
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
446
|
+
the text `prompt`, usually at the expense of lower image quality.
|
447
447
|
output_type (`str`, *optional*, defaults to `"pt"`):
|
448
448
|
The output format of the generated image. Choose between: `"np"` (`np.array`) or `"pt"`
|
449
449
|
(`torch.Tensor`).
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright
|
1
|
+
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -162,11 +162,11 @@ class KandinskyV22Pipeline(DiffusionPipeline):
|
|
162
162
|
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
163
163
|
expense of slower inference.
|
164
164
|
guidance_scale (`float`, *optional*, defaults to 4.0):
|
165
|
-
Guidance scale as defined in [Classifier-Free Diffusion
|
166
|
-
`guidance_scale` is defined as `w` of equation 2.
|
167
|
-
Paper](https://
|
168
|
-
1`. Higher guidance scale encourages to generate images that are closely linked to
|
169
|
-
usually at the expense of lower image quality.
|
165
|
+
Guidance scale as defined in [Classifier-Free Diffusion
|
166
|
+
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
167
|
+
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
168
|
+
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
169
|
+
the text `prompt`, usually at the expense of lower image quality.
|
170
170
|
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
171
171
|
The number of images to generate per prompt.
|
172
172
|
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|