diffusers 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +48 -1
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/diffusers_cli.py +1 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +1 -1
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/hooks/faster_cache.py +2 -2
- diffusers/hooks/group_offloading.py +128 -29
- diffusers/hooks/hooks.py +2 -2
- diffusers/hooks/layerwise_casting.py +3 -3
- diffusers/hooks/pyramid_attention_broadcast.py +1 -1
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +4 -0
- diffusers/loaders/ip_adapter.py +5 -14
- diffusers/loaders/lora_base.py +212 -111
- diffusers/loaders/lora_conversion_utils.py +275 -34
- diffusers/loaders/lora_pipeline.py +1554 -819
- diffusers/loaders/peft.py +52 -109
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +20 -4
- diffusers/loaders/single_file_utils.py +225 -5
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +1 -1
- diffusers/loaders/transformer_sd3.py +2 -2
- diffusers/loaders/unet.py +2 -16
- diffusers/loaders/unet_loader_utils.py +1 -1
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +15 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +4 -4
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +14 -10
- diffusers/models/auto_model.py +47 -10
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +1 -1
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +16 -15
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +1 -1
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +10 -12
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/modeling_utils.py +44 -14
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +742 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +317 -25
- diffusers/models/transformers/transformer_cosmos.py +579 -0
- diffusers/models/transformers/transformer_flux.py +9 -11
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +2 -2
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +24 -8
- diffusers/models/transformers/transformer_wan_vace.py +393 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +2 -2
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/pipelines/__init__.py +37 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +6 -7
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +1 -1
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +10 -17
- diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
- diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +3 -4
- diffusers/pipelines/pipeline_loading_utils.py +89 -13
- diffusers/pipelines/pipeline_utils.py +105 -33
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +13 -10
- diffusers/pipelines/wan/pipeline_wan_i2v.py +38 -18
- diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +179 -1
- diffusers/quantizers/base.py +6 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +16 -13
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +8 -8
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -1
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
- diffusers/schedulers/scheduling_utils.py +1 -1
- diffusers/schedulers/scheduling_utils_flax.py +1 -1
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +13 -5
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +120 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
- diffusers/utils/dynamic_modules_utils.py +21 -3
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/import_utils.py +81 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +91 -8
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +59 -7
- diffusers/utils/torch_utils.py +25 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/METADATA +70 -55
- diffusers-0.34.0.dist-info/RECORD +639 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/WHEEL +1 -1
- diffusers-0.33.1.dist-info/RECORD +0 -608
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/ltx/pipeline_ltx_image2video.py

```diff
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -159,6 +159,33 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    r"""
+    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
+    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
+    Flawed](https://huggingface.co/papers/2305.08891).
+
+    Args:
+        noise_cfg (`torch.Tensor`):
+            The predicted noise tensor for the guided diffusion process.
+        noise_pred_text (`torch.Tensor`):
+            The predicted noise tensor for the text-guided diffusion process.
+        guidance_rescale (`float`, *optional*, defaults to 0.0):
+            A rescale factor applied to the noise predictions.
+
+    Returns:
+        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
 class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation.
@@ -542,6 +569,10 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
     def guidance_scale(self):
         return self._guidance_scale
 
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
     @property
     def do_classifier_free_guidance(self):
         return self._guidance_scale > 1.0
@@ -576,6 +607,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         guidance_scale: float = 3,
+        guidance_rescale: float = 0.0,
         num_videos_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
@@ -615,11 +647,16 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, defaults to `3 `):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -688,6 +725,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
         )
 
         self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
         self._attention_kwargs = attention_kwargs
         self._interrupt = False
         self._current_timestep = None
@@ -811,6 +849,12 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
                     timestep, _ = timestep.chunk(2)
 
+                if self.guidance_rescale > 0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+                    )
+
                 # compute the previous noisy sample x_t -> x_t-1
                 noise_pred = self._unpack_latents(
                     noise_pred,
```
diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py (new file)

```diff
@@ -0,0 +1,277 @@
+# Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Union
+
+import torch
+
+from ...image_processor import PipelineImageInput
+from ...models import AutoencoderKLLTXVideo
+from ...utils import get_logger
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .modeling_latent_upsampler import LTXLatentUpsamplerModel
+from .pipeline_output import LTXPipelineOutput
+
+
+logger = get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class LTXLatentUpsamplePipeline(DiffusionPipeline):
+    model_cpu_offload_seq = ""
+
+    def __init__(
+        self,
+        vae: AutoencoderKLLTXVideo,
+        latent_upsampler: LTXLatentUpsamplerModel,
+    ) -> None:
+        super().__init__()
+
+        self.register_modules(vae=vae, latent_upsampler=latent_upsampler)
+
+        self.vae_spatial_compression_ratio = (
+            self.vae.spatial_compression_ratio if getattr(self, "vae", None) is not None else 32
+        )
+        self.vae_temporal_compression_ratio = (
+            self.vae.temporal_compression_ratio if getattr(self, "vae", None) is not None else 8
+        )
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_spatial_compression_ratio)
+
+    def prepare_latents(
+        self,
+        video: Optional[torch.Tensor] = None,
+        batch_size: int = 1,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        video = video.to(device=device, dtype=self.vae.dtype)
+        if isinstance(generator, list):
+            if len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            init_latents = [
+                retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
+            ]
+        else:
+            init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
+
+        init_latents = torch.cat(init_latents, dim=0).to(dtype)
+        init_latents = self._normalize_latents(init_latents, self.vae.latents_mean, self.vae.latents_std)
+        return init_latents
+
+    def adain_filter_latent(self, latents: torch.Tensor, reference_latents: torch.Tensor, factor: float = 1.0):
+        """
+        Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on statistics from a reference latent
+        tensor.
+
+        Args:
+            latent (`torch.Tensor`):
+                Input latents to normalize
+            reference_latents (`torch.Tensor`):
+                The reference latents providing style statistics.
+            factor (`float`):
+                Blending factor between original and transformed latent. Range: -10.0 to 10.0, Default: 1.0
+
+        Returns:
+            torch.Tensor: The transformed latent tensor
+        """
+        result = latents.clone()
+
+        for i in range(latents.size(0)):
+            for c in range(latents.size(1)):
+                r_sd, r_mean = torch.std_mean(reference_latents[i, c], dim=None)  # index by original dim order
+                i_sd, i_mean = torch.std_mean(result[i, c], dim=None)
+
+                result[i, c] = ((result[i, c] - i_mean) / i_sd) * r_sd + r_mean
+
+        result = torch.lerp(latents, result, factor)
+        return result
+
+    @staticmethod
+    # Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents
+    def _normalize_latents(
+        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
+    ) -> torch.Tensor:
+        # Normalize latents across the channel dimension [B, C, F, H, W]
+        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents = (latents - latents_mean) * scaling_factor / latents_std
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._denormalize_latents
+    def _denormalize_latents(
+        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
+    ) -> torch.Tensor:
+        # Denormalize latents across the channel dimension [B, C, F, H, W]
+        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents = latents * latents_std / scaling_factor + latents_mean
+        return latents
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def check_inputs(self, video, height, width, latents):
+        if height % self.vae_spatial_compression_ratio != 0 or width % self.vae_spatial_compression_ratio != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
+
+        if video is not None and latents is not None:
+            raise ValueError("Only one of `video` or `latents` can be provided.")
+        if video is None and latents is None:
+            raise ValueError("One of `video` or `latents` has to be provided.")
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        video: Optional[List[PipelineImageInput]] = None,
+        height: int = 512,
+        width: int = 704,
+        latents: Optional[torch.Tensor] = None,
+        decode_timestep: Union[float, List[float]] = 0.0,
+        decode_noise_scale: Optional[Union[float, List[float]]] = None,
+        adain_factor: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        self.check_inputs(
+            video=video,
+            height=height,
+            width=width,
+            latents=latents,
+        )
+
+        if video is not None:
+            # Batched video input is not yet tested/supported. TODO: take a look later
+            batch_size = 1
+        else:
+            batch_size = latents.shape[0]
+        device = self._execution_device
+
+        if video is not None:
+            num_frames = len(video)
+            if num_frames % self.vae_temporal_compression_ratio != 1:
+                num_frames = (
+                    num_frames // self.vae_temporal_compression_ratio * self.vae_temporal_compression_ratio + 1
+                )
+                video = video[:num_frames]
+                logger.warning(
+                    f"Video length expected to be of the form `k * {self.vae_temporal_compression_ratio} + 1` but is {len(video)}. Truncating to {num_frames} frames."
+                )
+            video = self.video_processor.preprocess_video(video, height=height, width=width)
+            video = video.to(device=device, dtype=torch.float32)
+
+        latents = self.prepare_latents(
+            video=video,
+            batch_size=batch_size,
+            dtype=torch.float32,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        latents = self._denormalize_latents(
+            latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
+        )
+        latents = latents.to(self.latent_upsampler.dtype)
+        latents_upsampled = self.latent_upsampler(latents)
+
+        if adain_factor > 0.0:
+            latents = self.adain_filter_latent(latents_upsampled, latents, adain_factor)
+        else:
+            latents = latents_upsampled
+
+        if output_type == "latent":
+            latents = self._normalize_latents(
+                latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
+            )
+            video = latents
+        else:
+            if not self.vae.config.timestep_conditioning:
+                timestep = None
+            else:
+                noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
+                if not isinstance(decode_timestep, list):
+                    decode_timestep = [decode_timestep] * batch_size
+                if decode_noise_scale is None:
+                    decode_noise_scale = decode_timestep
+                elif not isinstance(decode_noise_scale, list):
+                    decode_noise_scale = [decode_noise_scale] * batch_size
+
+                timestep = torch.tensor(decode_timestep, device=device, dtype=latents.dtype)
+                decode_noise_scale = torch.tensor(decode_noise_scale, device=device, dtype=latents.dtype)[
+                    :, None, None, None, None
+                ]
+                latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise
+
+            video = self.vae.decode(latents, timestep, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return LTXPipelineOutput(frames=video)
```
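The new `LTXLatentUpsamplePipeline` contains no denoiser: it encodes input frames with the LTX VAE (or takes latents directly), denormalizes them, runs the learned latent upsampler, optionally re-matches channel statistics to the pre-upsample latents via AdaIN, and decodes. A sketch of chaining it after a base LTX generation, assuming the two-stage pattern from the LTX-Video 0.9.7 release; both repo ids are assumptions, not part of this diff:

```python
import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video

# Repo ids assumed for illustration; the upsampler reuses the base pipeline's VAE.
pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16
)
pipe.to("cuda")
pipe_upsample.to("cuda")

# Generate at reduced resolution and stay in latent space to skip a decode/re-encode round trip.
latents = pipe(
    prompt="a sailboat gliding across a calm sea at sunset",
    width=512,
    height=320,
    num_frames=97,  # must be of the form k * 8 + 1
    output_type="latent",
).frames

# Spatial upsampling in latent space; adain_factor > 0 re-matches channel statistics.
video = pipe_upsample(latents=latents, adain_factor=1.0, output_type="pil").frames[0]
export_to_video(video, "upscaled.mp4", fps=24)
```

Because `adain_factor` defaults to 0.0, the AdaIN blend is opt-in: `adain_filter_latent` normalizes each `[i, c]` latent channel to zero mean and unit variance, re-applies the reference statistics, then `torch.lerp`s back toward the input by the given factor.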
diffusers/pipelines/lumina/pipeline_lumina.py

```diff
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -372,7 +372,7 @@ class LuminaPipeline(DiffusionPipeline):
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
         # and should be between [0, 1]
 
         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -534,7 +534,7 @@ class LuminaPipeline(DiffusionPipeline):
         # &amp
         caption = re.sub(r"&amp", "", caption)
 
-        # ip
+        # ip addresses:
        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
 
         # article ids:
@@ -619,7 +619,7 @@ class LuminaPipeline(DiffusionPipeline):
         return self._guidance_scale
 
     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://
+    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
     # corresponds to doing no classifier free guidance.
     @property
     def do_classifier_free_guidance(self):
@@ -677,11 +677,11 @@ class LuminaPipeline(DiffusionPipeline):
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
             guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             height (`int`, *optional*, defaults to self.unet.config.sample_size):
@@ -689,8 +689,8 @@ class LuminaPipeline(DiffusionPipeline):
             width (`int`, *optional*, defaults to self.unet.config.sample_size):
                 The width in pixels of the generated image.
             eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://
-                [`schedulers.DDIMScheduler`], will be ignored for others.
+                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
@@ -771,7 +771,7 @@ class LuminaPipeline(DiffusionPipeline):
         device = self._execution_device
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://
+        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
 
@@ -848,7 +848,7 @@ class LuminaPipeline(DiffusionPipeline):
                 # prepare image_rotary_emb for positional encoding
                 # dynamic scaling_factor for different resolution.
                 # NOTE: For `Time-aware` denosing mechanism from Lumina-Next
-                # https://
+                # https://huggingface.co/papers/2406.18583, Sec 2.3
                 # NOTE: We should compute different image_rotary_emb with different timestep.
                 if current_timestep[0] < scaling_watershed:
                     linear_factor = scaling_factor
```
diffusers/pipelines/lumina2/pipeline_lumina2.py

```diff
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -342,7 +342,7 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
         # and should be between [0, 1]
 
         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -487,7 +487,7 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
         return self._attention_kwargs
 
     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://
+    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
     # corresponds to doing no classifier free guidance.
     @property
     def do_classifier_free_guidance(self):
@@ -544,11 +544,11 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
             guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             height (`int`, *optional*, defaults to self.unet.config.sample_size):
@@ -556,8 +556,8 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
             width (`int`, *optional*, defaults to self.unet.config.sample_size):
                 The width in pixels of the generated image.
             eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://
-                [`schedulers.DDIMScheduler`], will be ignored for others.
+                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
```
@@ -426,7 +426,7 @@ class MarigoldImageProcessor(ConfigMixin):
                if isinstance(img, np.ndarray):
                    img = torch.from_numpy(img)
                if not torch.is_floating_point(img):
-                    raise ValueError(f"{prefix}:
+                    raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
            else:
                raise ValueError(f"{prefix}: unexpected type={type(img)}.")
            if val_min != 0.0 or val_max != 1.0:
@@ -464,7 +464,7 @@ class MarigoldImageProcessor(ConfigMixin):
            if torch.is_tensor(img):
                img = img.cpu().numpy()
            if not np.issubdtype(img.dtype, np.floating):
-                raise ValueError(f"{prefix}:
+                raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
            if val_min != 0.0 or val_max != 1.0:
                img = (img - val_min) / (val_max - val_min)
            img = (img * (2**16 - 1)).astype(np.uint16)
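Note: the two Marigold hunks restore truncated dtype error messages in the float-validation paths. A standalone sketch of the validate-rescale-quantize sequence the second hunk sits in (the helper name is illustrative):

    import numpy as np

    def to_uint16(img: np.ndarray, val_min: float = 0.0, val_max: float = 1.0) -> np.ndarray:
        # Reject non-float inputs up front, mirroring the restored error message.
        if not np.issubdtype(img.dtype, np.floating):
            raise ValueError(f"unexpected dtype={img.dtype}.")
        # Rescale to [0, 1] only when a non-default value range was given.
        if val_min != 0.0 or val_max != 1.0:
            img = (img - val_min) / (val_max - val_min)
        # Quantize to the full 16-bit range.
        return (img * (2**16 - 1)).astype(np.uint16)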
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Genmo and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -521,11 +521,11 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, defaults to `4.5`):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,8 +35,8 @@ from ...utils import (
     logging,
     replace_example_docstring,
 )
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin
+from ...utils.torch_utils import empty_device_cache, get_device, randn_tensor
+from ..pipeline_utils import AudioPipelineOutput, DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin


 if is_librosa_available():
@@ -76,7 +76,8 @@ EXAMPLE_DOC_STRING = """
 """


-class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
+class MusicLDMPipeline(DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin):
+    _last_supported_version = "0.33.1"
     r"""
     Pipeline for text-to-audio generation using MusicLDM.

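Note: adding `DeprecatedPipelineMixin` with `_last_supported_version` marks MusicLDM as deprecated in this release. A hedged sketch of how such a mixin could surface that marker at construction time; this is an assumption about the mechanism, not diffusers' actual implementation:

    import warnings

    class DeprecatedPipelineMixinSketch:
        _last_supported_version = "0.33.1"

        def __init__(self, *args, **kwargs):
            # Warn once per instantiation that the pipeline is frozen at an
            # older release, so users can pin that version if they rely on it.
            warnings.warn(
                f"{type(self).__name__} is deprecated; it was last fully supported in "
                f"diffusers=={self._last_supported_version}.",
                FutureWarning,
            )
            super().__init__(*args, **kwargs)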
@@ -297,7 +298,7 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
         # and should be between [0, 1]

         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -396,20 +397,22 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the
-        method is called, and the model remains in
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the accelerator when its
+        `forward` method is called, and the model remains in accelerator until the next model runs. Memory savings are
+        lower than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution
+        of the `unet`.
         """
         if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
             from accelerate import cpu_offload_with_hook
         else:
             raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

-
+        device_type = get_device()
+        device = torch.device(f"{device_type}:{gpu_id}")

         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-
+            empty_device_cache()  # otherwise we don't see the memory savings (but they probably exist)

         model_sequence = [
             self.text_encoder.text_model,
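Note: this hunk replaces a hard-coded CUDA device with `get_device()` and a backend-agnostic `empty_device_cache()`. A hedged sketch of what such device resolution can look like; the helper below is a stand-in written for illustration, not diffusers' actual implementation:

    import torch

    def resolve_accelerator(gpu_id: int = 0) -> torch.device:
        # Prefer CUDA, then Intel XPU if this torch build exposes it,
        # otherwise fall back to CPU (which takes no device index).
        if torch.cuda.is_available():
            return torch.device(f"cuda:{gpu_id}")
        if getattr(torch, "xpu", None) is not None and torch.xpu.is_available():
            return torch.device(f"xpu:{gpu_id}")
        return torch.device("cpu")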
@@ -472,8 +475,8 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
                 and the input text. This scoring ranks the generated waveforms based on their cosine similarity to text
                 input in the joint text-audio embedding space.
             eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) from the [DDIM](https://
-                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
+                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
@@ -548,7 +551,7 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):

         device = self._execution_device
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://
+        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0

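Note: `do_classifier_free_guidance = guidance_scale > 1.0` gates the batching trick used throughout diffusers pipelines: both guidance branches run in one forward pass. A minimal sketch with dummy embeddings (shapes and names are illustrative):

    import torch

    guidance_scale = 4.0
    do_classifier_free_guidance = guidance_scale > 1.0  # w == 1 disables guidance

    prompt_embeds = torch.randn(1, 77, 768)  # stand-in text-encoder output
    negative_prompt_embeds = torch.zeros_like(prompt_embeds)

    if do_classifier_free_guidance:
        # Concatenate [uncond, cond] so one model call yields both predictions,
        # which are later split and recombined with the guidance weight.
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])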