diffusers 0.33.1__py3-none-any.whl → 0.35.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in the public registry.
- diffusers/__init__.py +145 -1
- diffusers/callbacks.py +35 -0
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/custom_blocks.py +134 -0
- diffusers/commands/diffusers_cli.py +3 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +2 -2
- diffusers/configuration_utils.py +11 -2
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +3 -3
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/guiders/__init__.py +41 -0
- diffusers/guiders/adaptive_projected_guidance.py +188 -0
- diffusers/guiders/auto_guidance.py +190 -0
- diffusers/guiders/classifier_free_guidance.py +141 -0
- diffusers/guiders/classifier_free_zero_star_guidance.py +152 -0
- diffusers/guiders/frequency_decoupled_guidance.py +327 -0
- diffusers/guiders/guider_utils.py +309 -0
- diffusers/guiders/perturbed_attention_guidance.py +271 -0
- diffusers/guiders/skip_layer_guidance.py +262 -0
- diffusers/guiders/smoothed_energy_guidance.py +251 -0
- diffusers/guiders/tangential_classifier_free_guidance.py +143 -0
- diffusers/hooks/__init__.py +17 -0
- diffusers/hooks/_common.py +56 -0
- diffusers/hooks/_helpers.py +293 -0
- diffusers/hooks/faster_cache.py +9 -8
- diffusers/hooks/first_block_cache.py +259 -0
- diffusers/hooks/group_offloading.py +332 -227
- diffusers/hooks/hooks.py +58 -3
- diffusers/hooks/layer_skip.py +263 -0
- diffusers/hooks/layerwise_casting.py +5 -10
- diffusers/hooks/pyramid_attention_broadcast.py +15 -12
- diffusers/hooks/smoothed_energy_guidance_utils.py +167 -0
- diffusers/hooks/utils.py +43 -0
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +10 -0
- diffusers/loaders/ip_adapter.py +260 -18
- diffusers/loaders/lora_base.py +261 -127
- diffusers/loaders/lora_conversion_utils.py +657 -35
- diffusers/loaders/lora_pipeline.py +2778 -1246
- diffusers/loaders/peft.py +78 -112
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +64 -15
- diffusers/loaders/single_file_utils.py +395 -7
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +10 -11
- diffusers/loaders/transformer_sd3.py +8 -3
- diffusers/loaders/unet.py +24 -21
- diffusers/loaders/unet_loader_utils.py +6 -3
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +23 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +488 -7
- diffusers/models/attention_dispatch.py +1218 -0
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +113 -667
- diffusers/models/auto_model.py +49 -12
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +17 -4
- diffusers/models/autoencoders/autoencoder_kl.py +5 -5
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1110 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +1070 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +626 -62
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +32 -10
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +21 -20
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +5 -5
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +36 -46
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/model_loading_utils.py +233 -1
- diffusers/models/modeling_flax_utils.py +1 -2
- diffusers/models/modeling_utils.py +203 -108
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +7 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +641 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +353 -27
- diffusers/models/transformers/transformer_cosmos.py +586 -0
- diffusers/models/transformers/transformer_flux.py +376 -138
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +12 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +105 -24
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_qwenimage.py +645 -0
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_skyreels_v2.py +607 -0
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +316 -87
- diffusers/models/transformers/transformer_wan_vace.py +387 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +4 -3
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/modular_pipelines/__init__.py +83 -0
- diffusers/modular_pipelines/components_manager.py +1068 -0
- diffusers/modular_pipelines/flux/__init__.py +66 -0
- diffusers/modular_pipelines/flux/before_denoise.py +689 -0
- diffusers/modular_pipelines/flux/decoders.py +109 -0
- diffusers/modular_pipelines/flux/denoise.py +227 -0
- diffusers/modular_pipelines/flux/encoders.py +412 -0
- diffusers/modular_pipelines/flux/modular_blocks.py +181 -0
- diffusers/modular_pipelines/flux/modular_pipeline.py +59 -0
- diffusers/modular_pipelines/modular_pipeline.py +2446 -0
- diffusers/modular_pipelines/modular_pipeline_utils.py +672 -0
- diffusers/modular_pipelines/node_utils.py +665 -0
- diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +77 -0
- diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py +1874 -0
- diffusers/modular_pipelines/stable_diffusion_xl/decoders.py +208 -0
- diffusers/modular_pipelines/stable_diffusion_xl/denoise.py +771 -0
- diffusers/modular_pipelines/stable_diffusion_xl/encoders.py +887 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +380 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +365 -0
- diffusers/modular_pipelines/wan/__init__.py +66 -0
- diffusers/modular_pipelines/wan/before_denoise.py +365 -0
- diffusers/modular_pipelines/wan/decoders.py +105 -0
- diffusers/modular_pipelines/wan/denoise.py +261 -0
- diffusers/modular_pipelines/wan/encoders.py +242 -0
- diffusers/modular_pipelines/wan/modular_blocks.py +144 -0
- diffusers/modular_pipelines/wan/modular_pipeline.py +90 -0
- diffusers/pipelines/__init__.py +68 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +22 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +23 -20
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +17 -16
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +17 -16
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +18 -17
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +17 -16
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +23 -22
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +11 -10
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +226 -107
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +12 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +207 -105
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +8 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +4 -2
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/__init__.py +4 -0
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +37 -36
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -9
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +31 -23
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +3 -2
- diffusers/pipelines/flux/pipeline_flux_fill.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_img2img.py +40 -7
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +12 -7
- diffusers/pipelines/flux/pipeline_flux_kontext.py +1134 -0
- diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py +1460 -0
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +2 -2
- diffusers/pipelines/flux/pipeline_output.py +6 -4
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +26 -25
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +64 -18
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +117 -38
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +63 -18
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +15 -14
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +5 -6
- diffusers/pipelines/pipeline_loading_utils.py +113 -15
- diffusers/pipelines/pipeline_utils.py +127 -48
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +14 -12
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +31 -11
- diffusers/pipelines/qwenimage/__init__.py +55 -0
- diffusers/pipelines/qwenimage/pipeline_output.py +21 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage.py +726 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +882 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +829 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +1015 -0
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/skyreels_v2/__init__.py +59 -0
- diffusers/pipelines/skyreels_v2/pipeline_output.py +20 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py +610 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py +978 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py +1059 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_v2v.py +1063 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py +745 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +12 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +13 -12
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +91 -30
- diffusers/pipelines/wan/pipeline_wan_i2v.py +145 -45
- diffusers/pipelines/wan/pipeline_wan_vace.py +975 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +3 -1
- diffusers/quantizers/base.py +17 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +108 -16
- diffusers/quantizers/pipe_quant_config.py +202 -0
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +31 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +16 -9
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +18 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +19 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -2
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +21 -12
- diffusers/schedulers/scheduling_utils.py +3 -3
- diffusers/schedulers/scheduling_utils_flax.py +2 -2
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +91 -5
- diffusers/utils/__init__.py +15 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/constants.py +4 -0
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +432 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +480 -0
- diffusers/utils/dynamic_modules_utils.py +85 -8
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/hub_utils.py +33 -17
- diffusers/utils/import_utils.py +151 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +96 -10
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +195 -17
- diffusers/utils/torch_utils.py +43 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/METADATA +72 -57
- diffusers-0.35.0.dist-info/RECORD +703 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/WHEEL +1 -1
- diffusers-0.33.1.dist-info/RECORD +0 -608
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/ltx/pipeline_ltx_image2video.py (+63 -18)

@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -159,6 +159,33 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    r"""
+    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
+    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
+    Flawed](https://huggingface.co/papers/2305.08891).
+
+    Args:
+        noise_cfg (`torch.Tensor`):
+            The predicted noise tensor for the guided diffusion process.
+        noise_pred_text (`torch.Tensor`):
+            The predicted noise tensor for the text-guided diffusion process.
+        guidance_rescale (`float`, *optional*, defaults to 0.0):
+            A rescale factor applied to the noise predictions.
+
+    Returns:
+        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
 class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation.
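The helper matches the per-sample standard deviation of the CFG-combined prediction to that of the text-conditioned prediction, then linearly blends the rescaled and unrescaled results by `guidance_rescale`. A minimal standalone sketch of that behavior, mirroring the function body shown in the hunk above:

```python
import torch

def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    # Per-sample std over all non-batch dimensions.
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # Rescale so the guided prediction has the text branch's std, then blend.
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

noise_pred_text = torch.randn(2, 128, 9, 32, 32)
noise_cfg = 3.0 * noise_pred_text  # CFG inflates the prediction's magnitude

out = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0)
# With guidance_rescale=1.0 the output std collapses back to the text branch's std.
assert torch.allclose(out.std(), noise_pred_text.std())
```

With `guidance_rescale=0.0` (the default) the function is the identity, so existing callers are unaffected.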
@@ -542,6 +569,10 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
     def guidance_scale(self):
         return self._guidance_scale
 
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
     @property
     def do_classifier_free_guidance(self):
         return self._guidance_scale > 1.0
@@ -576,6 +607,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         guidance_scale: float = 3,
+        guidance_rescale: float = 0.0,
         num_videos_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
@@ -615,11 +647,16 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, defaults to `3 `):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -688,6 +725,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
         )
 
         self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
         self._attention_kwargs = attention_kwargs
         self._interrupt = False
         self._current_timestep = None
@@ -792,18 +830,19 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                 timestep = t.expand(latent_model_input.shape[0])
                 timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask)
 
-                noise_pred = self.transformer(
-                    hidden_states=latent_model_input,
-                    encoder_hidden_states=prompt_embeds,
-                    timestep=timestep,
-                    encoder_attention_mask=prompt_attention_mask,
-                    num_frames=latent_num_frames,
-                    height=latent_height,
-                    width=latent_width,
-                    rope_interpolation_scale=rope_interpolation_scale,
-                    attention_kwargs=attention_kwargs,
-                    return_dict=False,
-                )[0]
+                with self.transformer.cache_context("cond_uncond"):
+                    noise_pred = self.transformer(
+                        hidden_states=latent_model_input,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep=timestep,
+                        encoder_attention_mask=prompt_attention_mask,
+                        num_frames=latent_num_frames,
+                        height=latent_height,
+                        width=latent_width,
+                        rope_interpolation_scale=rope_interpolation_scale,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                    )[0]
                 noise_pred = noise_pred.float()
 
                 if self.do_classifier_free_guidance:
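The `cache_context("cond_uncond")` wrapper is new plumbing for the cache hooks added in this release (see `diffusers/hooks/first_block_cache.py` and the expanded `diffusers/models/cache_utils.py` in the file list above): it labels this forward pass so a cache hook can keep separate state for the conditional/unconditional branch. A hedged sketch of how a user would opt in; the `FirstBlockCacheConfig` import path is an assumption based on the new hooks package:

```python
import torch
from diffusers import LTXImageToVideoPipeline
from diffusers.hooks import FirstBlockCacheConfig  # export path assumed from the new hooks package

pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# CacheMixin.enable_cache installs the hook on the transformer; the threshold
# trades inference speed against fidelity (higher skips more blocks).
pipe.transformer.enable_cache(FirstBlockCacheConfig(threshold=0.2))
```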
@@ -811,6 +850,12 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
                     timestep, _ = timestep.chunk(2)
 
+                    if self.guidance_rescale > 0:
+                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg(
+                            noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+                        )
+
                 # compute the previous noisy sample x_t -> x_t-1
                 noise_pred = self._unpack_latents(
                     noise_pred,
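Taken together, these hunks thread a `guidance_rescale` knob through LTX image-to-video: a new `__call__` argument, a property, and an application of `rescale_noise_cfg` after the CFG combine. A hedged usage sketch (the model id is the public LTX-Video checkpoint; the image path and prompt are placeholders):

```python
import torch
from diffusers import LTXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("input_frame.png")  # placeholder conditioning image
video = pipe(
    image=image,
    prompt="a coastal town at dusk, waves rolling in",  # placeholder prompt
    guidance_scale=3.0,
    guidance_rescale=0.7,  # new in 0.35; 0.0 (the default) preserves the old behavior
    num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```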
diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py (new file, +277 -0)

@@ -0,0 +1,277 @@
+# Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Union
+
+import torch
+
+from ...image_processor import PipelineImageInput
+from ...models import AutoencoderKLLTXVideo
+from ...utils import get_logger
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .modeling_latent_upsampler import LTXLatentUpsamplerModel
+from .pipeline_output import LTXPipelineOutput
+
+
+logger = get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class LTXLatentUpsamplePipeline(DiffusionPipeline):
+    model_cpu_offload_seq = ""
+
+    def __init__(
+        self,
+        vae: AutoencoderKLLTXVideo,
+        latent_upsampler: LTXLatentUpsamplerModel,
+    ) -> None:
+        super().__init__()
+
+        self.register_modules(vae=vae, latent_upsampler=latent_upsampler)
+
+        self.vae_spatial_compression_ratio = (
+            self.vae.spatial_compression_ratio if getattr(self, "vae", None) is not None else 32
+        )
+        self.vae_temporal_compression_ratio = (
+            self.vae.temporal_compression_ratio if getattr(self, "vae", None) is not None else 8
+        )
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_spatial_compression_ratio)
+
+    def prepare_latents(
+        self,
+        video: Optional[torch.Tensor] = None,
+        batch_size: int = 1,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        video = video.to(device=device, dtype=self.vae.dtype)
+        if isinstance(generator, list):
+            if len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            init_latents = [
+                retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
+            ]
+        else:
+            init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
+
+        init_latents = torch.cat(init_latents, dim=0).to(dtype)
+        init_latents = self._normalize_latents(init_latents, self.vae.latents_mean, self.vae.latents_std)
+        return init_latents
+
+    def adain_filter_latent(self, latents: torch.Tensor, reference_latents: torch.Tensor, factor: float = 1.0):
+        """
+        Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on statistics from a reference latent
+        tensor.
+
+        Args:
+            latent (`torch.Tensor`):
+                Input latents to normalize
+            reference_latents (`torch.Tensor`):
+                The reference latents providing style statistics.
+            factor (`float`):
+                Blending factor between original and transformed latent. Range: -10.0 to 10.0, Default: 1.0
+
+        Returns:
+            torch.Tensor: The transformed latent tensor
+        """
+        result = latents.clone()
+
+        for i in range(latents.size(0)):
+            for c in range(latents.size(1)):
+                r_sd, r_mean = torch.std_mean(reference_latents[i, c], dim=None)  # index by original dim order
+                i_sd, i_mean = torch.std_mean(result[i, c], dim=None)
+
+                result[i, c] = ((result[i, c] - i_mean) / i_sd) * r_sd + r_mean
+
+        result = torch.lerp(latents, result, factor)
+        return result
+
+    @staticmethod
+    # Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents
+    def _normalize_latents(
+        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
+    ) -> torch.Tensor:
+        # Normalize latents across the channel dimension [B, C, F, H, W]
+        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents = (latents - latents_mean) * scaling_factor / latents_std
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._denormalize_latents
+    def _denormalize_latents(
+        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
+    ) -> torch.Tensor:
+        # Denormalize latents across the channel dimension [B, C, F, H, W]
+        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents = latents * latents_std / scaling_factor + latents_mean
+        return latents
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def check_inputs(self, video, height, width, latents):
+        if height % self.vae_spatial_compression_ratio != 0 or width % self.vae_spatial_compression_ratio != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
+
+        if video is not None and latents is not None:
+            raise ValueError("Only one of `video` or `latents` can be provided.")
+        if video is None and latents is None:
+            raise ValueError("One of `video` or `latents` has to be provided.")
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        video: Optional[List[PipelineImageInput]] = None,
+        height: int = 512,
+        width: int = 704,
+        latents: Optional[torch.Tensor] = None,
+        decode_timestep: Union[float, List[float]] = 0.0,
+        decode_noise_scale: Optional[Union[float, List[float]]] = None,
+        adain_factor: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        self.check_inputs(
+            video=video,
+            height=height,
+            width=width,
+            latents=latents,
+        )
+
+        if video is not None:
+            # Batched video input is not yet tested/supported. TODO: take a look later
+            batch_size = 1
+        else:
+            batch_size = latents.shape[0]
+        device = self._execution_device
+
+        if video is not None:
+            num_frames = len(video)
+            if num_frames % self.vae_temporal_compression_ratio != 1:
+                num_frames = (
+                    num_frames // self.vae_temporal_compression_ratio * self.vae_temporal_compression_ratio + 1
+                )
+                video = video[:num_frames]
+                logger.warning(
+                    f"Video length expected to be of the form `k * {self.vae_temporal_compression_ratio} + 1` but is {len(video)}. Truncating to {num_frames} frames."
+                )
+            video = self.video_processor.preprocess_video(video, height=height, width=width)
+            video = video.to(device=device, dtype=torch.float32)
+
+        latents = self.prepare_latents(
+            video=video,
+            batch_size=batch_size,
+            dtype=torch.float32,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        latents = self._denormalize_latents(
+            latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
+        )
+        latents = latents.to(self.latent_upsampler.dtype)
+        latents_upsampled = self.latent_upsampler(latents)
+
+        if adain_factor > 0.0:
+            latents = self.adain_filter_latent(latents_upsampled, latents, adain_factor)
+        else:
+            latents = latents_upsampled
+
+        if output_type == "latent":
+            latents = self._normalize_latents(
+                latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
+            )
+            video = latents
+        else:
+            if not self.vae.config.timestep_conditioning:
+                timestep = None
+            else:
+                noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
+                if not isinstance(decode_timestep, list):
+                    decode_timestep = [decode_timestep] * batch_size
+                if decode_noise_scale is None:
+                    decode_noise_scale = decode_timestep
+                elif not isinstance(decode_noise_scale, list):
+                    decode_noise_scale = [decode_noise_scale] * batch_size
+
+                timestep = torch.tensor(decode_timestep, device=device, dtype=latents.dtype)
+                decode_noise_scale = torch.tensor(decode_noise_scale, device=device, dtype=latents.dtype)[
+                    :, None, None, None, None
+                ]
+                latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise
+
+            video = self.vae.decode(latents, timestep, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return LTXPipelineOutput(frames=video)
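The new `LTXLatentUpsamplePipeline` encodes an input video (or accepts pre-computed latents), runs the learned `LTXLatentUpsamplerModel`, optionally AdaIN-matches the upsampled latents to the statistics of the originals (`adain_factor`), and decodes with the LTX VAE. A hedged sketch chaining it after the base text-to-video pipeline via `output_type="latent"`; the top-level export, the upsampler repo id, and the latent-output compatibility with the also-updated `pipeline_ltx.py` are assumptions:

```python
import torch
from diffusers import LTXPipeline, LTXLatentUpsamplePipeline  # top-level export assumed
from diffusers.utils import export_to_video

base = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16).to("cuda")
upsampler = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7",  # illustrative repo id for the upsampler weights
    vae=base.vae,
    torch_dtype=torch.bfloat16,
).to("cuda")

# Generate at low resolution, keeping the result in latent space.
latents = base(
    prompt="a hot air balloon drifting over snowy mountains",  # placeholder prompt
    width=512,
    height=320,
    num_frames=97,  # must be of the form k * 8 + 1 for the LTX VAE
    output_type="latent",
).frames

# Upsample spatially in latent space, then decode to frames.
video = upsampler(
    latents=latents,
    adain_factor=1.0,  # match upsampled latents to the original latent statistics
    output_type="pil",
).frames[0]
export_to_video(video, "upscaled.mp4", fps=24)
```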
diffusers/pipelines/lumina/pipeline_lumina.py (+13 -13)

@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -372,7 +372,7 @@ class LuminaPipeline(DiffusionPipeline):
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
         # and should be between [0, 1]
 
         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -534,7 +534,7 @@ class LuminaPipeline(DiffusionPipeline):
         # &
         caption = re.sub(r"&", "", caption)
 
-        # ip
+        # ip addresses:
         caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
 
         # article ids:
@@ -619,7 +619,7 @@ class LuminaPipeline(DiffusionPipeline):
         return self._guidance_scale
 
     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://
+    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
     # corresponds to doing no classifier free guidance.
     @property
     def do_classifier_free_guidance(self):
@@ -677,11 +677,11 @@ class LuminaPipeline(DiffusionPipeline):
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
             guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             height (`int`, *optional*, defaults to self.unet.config.sample_size):
@@ -689,8 +689,8 @@ class LuminaPipeline(DiffusionPipeline):
             width (`int`, *optional*, defaults to self.unet.config.sample_size):
                 The width in pixels of the generated image.
             eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://
-                [`schedulers.DDIMScheduler`], will be ignored for others.
+                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
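The `generator` argument documented above is plain torch seeding; a self-contained sketch of why it makes generation deterministic:

```python
import torch

generator = torch.Generator(device="cpu").manual_seed(42)
a = torch.randn(2, 2, generator=generator)

generator = torch.Generator(device="cpu").manual_seed(42)
b = torch.randn(2, 2, generator=generator)

assert torch.equal(a, b)  # same seed -> identical noise -> identical outputs
```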
@@ -771,7 +771,7 @@ class LuminaPipeline(DiffusionPipeline):
         device = self._execution_device
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://
+        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
 
@@ -848,7 +848,7 @@ class LuminaPipeline(DiffusionPipeline):
                 # prepare image_rotary_emb for positional encoding
                 # dynamic scaling_factor for different resolution.
                 # NOTE: For `Time-aware` denosing mechanism from Lumina-Next
-                # https://
+                # https://huggingface.co/papers/2406.18583, Sec 2.3
                 # NOTE: We should compute different image_rotary_emb with different timestep.
                 if current_timestep[0] < scaling_watershed:
                     linear_factor = scaling_factor
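The context around this hunk selects a RoPE scaling factor depending on where the current timestep falls relative to a watershed. A toy sketch of that branch (values are made up, and the full pipeline also handles an NTK factor not shown here):

```python
scaling_factor = 2.0       # hypothetical dynamic factor for a larger resolution
scaling_watershed = 0.3    # hypothetical switch point
current_timestep = [0.25]  # normalized timestep for the current step

# High-noise (early) steps use the dynamic factor; later steps fall back to 1.0
if current_timestep[0] < scaling_watershed:
    linear_factor = scaling_factor
else:
    linear_factor = 1.0
print(linear_factor)  # 2.0
```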
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -342,7 +342,7 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
         # and should be between [0, 1]
 
         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -487,7 +487,7 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
         return self._attention_kwargs
 
     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://
+    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
     # corresponds to doing no classifier free guidance.
     @property
     def do_classifier_free_guidance(self):
@@ -544,11 +544,11 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                 will be used.
             guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             height (`int`, *optional*, defaults to self.unet.config.sample_size):
@@ -556,8 +556,8 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
             width (`int`, *optional*, defaults to self.unet.config.sample_size):
                 The width in pixels of the generated image.
             eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://
-                [`schedulers.DDIMScheduler`], will be ignored for others.
+                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
@@ -426,7 +426,7 @@ class MarigoldImageProcessor(ConfigMixin):
             if isinstance(img, np.ndarray):
                 img = torch.from_numpy(img)
                 if not torch.is_floating_point(img):
-                    raise ValueError(f"{prefix}:
+                    raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
             else:
                 raise ValueError(f"{prefix}: unexpected type={type(img)}.")
             if val_min != 0.0 or val_max != 1.0:
@@ -464,7 +464,7 @@ class MarigoldImageProcessor(ConfigMixin):
             if torch.is_tensor(img):
                 img = img.cpu().numpy()
             if not np.issubdtype(img.dtype, np.floating):
-                raise ValueError(f"{prefix}:
+                raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
             if val_min != 0.0 or val_max != 1.0:
                 img = (img - val_min) / (val_max - val_min)
             img = (img * (2**16 - 1)).astype(np.uint16)
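The context lines show Marigold's float-to-16-bit quantization for PNG export; a standalone sketch of that mapping:

```python
import numpy as np

img = np.array([[0.0, 0.5, 1.0]], dtype=np.float32)  # values already in [0, 1]

# Rescale to the full unsigned 16-bit range before saving
img_u16 = (img * (2**16 - 1)).astype(np.uint16)
print(img_u16)  # [[    0 32767 65535]]
```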
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 Genmo and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -521,11 +521,11 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, defaults to `4.5`):
-                Guidance scale as defined in [Classifier-Free Diffusion
-                `guidance_scale` is defined as `w` of equation 2.
-                Paper](https://
-                1`. Higher guidance scale encourages to generate images that are closely linked to
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -671,14 +671,15 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
 
-                noise_pred = self.transformer(
-                    hidden_states=latent_model_input,
-                    encoder_hidden_states=prompt_embeds,
-                    timestep=timestep,
-                    encoder_attention_mask=prompt_attention_mask,
-                    attention_kwargs=attention_kwargs,
-                    return_dict=False,
-                )[0]
+                with self.transformer.cache_context("cond_uncond"):
+                    noise_pred = self.transformer(
+                        hidden_states=latent_model_input,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep=timestep,
+                        encoder_attention_mask=prompt_attention_mask,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                    )[0]
                 # Mochi CFG + Sampling runs in FP32
                 noise_pred = noise_pred.to(torch.float32)
 
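The final hunk wraps the Mochi transformer forward in a named cache context so that caching hooks can associate the batched cond/uncond pass with one cache slot. A hedged, self-contained sketch of the call shape only; the dummy class below stands in for a diffusers model, whose real `cache_context` comes from the library's caching machinery:

```python
from contextlib import contextmanager

class DummyTransformer:
    """Mimics the shape of a model exposing `cache_context`; not the real API."""

    @contextmanager
    def cache_context(self, name):
        # A real caching hook would key cached activations on `name` here
        print(f"entering cache context: {name}")
        yield

    def __call__(self, hidden_states):
        return hidden_states * 2

transformer = DummyTransformer()
with transformer.cache_context("cond_uncond"):
    noise_pred = transformer(21)
print(noise_pred)  # 42
```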