diffusers 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those package versions.
- diffusers/__init__.py +48 -1
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/diffusers_cli.py +1 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +1 -1
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/hooks/faster_cache.py +2 -2
- diffusers/hooks/group_offloading.py +128 -29
- diffusers/hooks/hooks.py +2 -2
- diffusers/hooks/layerwise_casting.py +3 -3
- diffusers/hooks/pyramid_attention_broadcast.py +1 -1
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +4 -0
- diffusers/loaders/ip_adapter.py +5 -14
- diffusers/loaders/lora_base.py +212 -111
- diffusers/loaders/lora_conversion_utils.py +275 -34
- diffusers/loaders/lora_pipeline.py +1554 -819
- diffusers/loaders/peft.py +52 -109
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +20 -4
- diffusers/loaders/single_file_utils.py +225 -5
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +1 -1
- diffusers/loaders/transformer_sd3.py +2 -2
- diffusers/loaders/unet.py +2 -16
- diffusers/loaders/unet_loader_utils.py +1 -1
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +15 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +4 -4
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +14 -10
- diffusers/models/auto_model.py +47 -10
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +1 -1
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +16 -15
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +1 -1
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +10 -12
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/modeling_utils.py +44 -14
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +742 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +317 -25
- diffusers/models/transformers/transformer_cosmos.py +579 -0
- diffusers/models/transformers/transformer_flux.py +9 -11
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +2 -2
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +24 -8
- diffusers/models/transformers/transformer_wan_vace.py +393 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +2 -2
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/pipelines/__init__.py +37 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +6 -7
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +1 -1
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +10 -17
- diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
- diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
- diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +3 -4
- diffusers/pipelines/pipeline_loading_utils.py +89 -13
- diffusers/pipelines/pipeline_utils.py +105 -33
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +17 -12
- diffusers/pipelines/wan/pipeline_wan_i2v.py +42 -20
- diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +18 -18
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +179 -1
- diffusers/quantizers/base.py +6 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +16 -13
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +8 -8
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -1
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
- diffusers/schedulers/scheduling_utils.py +1 -1
- diffusers/schedulers/scheduling_utils_flax.py +1 -1
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +13 -5
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +120 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
- diffusers/utils/dynamic_modules_utils.py +21 -3
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/import_utils.py +81 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +91 -8
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +59 -7
- diffusers/utils/torch_utils.py +25 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/METADATA +3 -3
- diffusers-0.34.0.dist-info/RECORD +639 -0
- diffusers-0.33.0.dist-info/RECORD +0 -608
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/WHEEL +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
diffusers/loaders/single_file_utils.py
CHANGED
@@ -126,6 +126,7 @@ CHECKPOINT_KEY_NAMES = {
     ],
     "wan": ["model.diffusion_model.head.modulation", "head.modulation"],
     "wan_vae": "decoder.middle.0.residual.0.gamma",
+    "hidream": "double_stream_blocks.0.block.adaLN_modulation.1.bias",
 }
 
 DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -177,6 +178,8 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
     "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
     "ltx-video": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.0"},
     "ltx-video-0.9.1": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.1"},
+    "ltx-video-0.9.5": {"pretrained_model_name_or_path": "Lightricks/LTX-Video-0.9.5"},
+    "ltx-video-0.9.7": {"pretrained_model_name_or_path": "Lightricks/LTX-Video-0.9.7-dev"},
     "autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
     "autoencoder-dc-f64c128": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers"},
     "autoencoder-dc-f32c32": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers"},
@@ -189,6 +192,7 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
     "wan-t2v-1.3B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
     "wan-t2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-14B-Diffusers"},
     "wan-i2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"},
+    "hidream": {"pretrained_model_name_or_path": "HiDream-ai/HiDream-I1-Dev"},
 }
 
 # Use to configure model sample size when original config is provided
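This table is the fallback that single-file loading relies on: `infer_diffusers_model_type` maps a raw checkpoint to one of these keys, and the matching Hub repo supplies the diffusers-format config. A minimal sketch of that flow, simplified from the real logic in `fetch_diffusers_config` (the helper name `default_repo_for` is ours, not the library's):

    from diffusers.loaders.single_file_utils import (
        DIFFUSERS_DEFAULT_PIPELINE_PATHS,
        infer_diffusers_model_type,
    )

    def default_repo_for(checkpoint: dict) -> str:
        # e.g. a HiDream checkpoint resolves to "HiDream-ai/HiDream-I1-Dev"
        model_type = infer_diffusers_model_type(checkpoint)
        return DIFFUSERS_DEFAULT_PIPELINE_PATHS[model_type]["pretrained_model_name_or_path"]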
@@ -404,13 +408,16 @@ def load_single_file_checkpoint(
     local_files_only=None,
     revision=None,
     disable_mmap=False,
+    user_agent=None,
 ):
+    if user_agent is None:
+        user_agent = {"file_type": "single_file", "framework": "pytorch"}
+
     if os.path.isfile(pretrained_model_link_or_path):
         pretrained_model_link_or_path = pretrained_model_link_or_path
 
     else:
         repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path)
-        user_agent = {"file_type": "single_file", "framework": "pytorch"}
         pretrained_model_link_or_path = _get_model_file(
             repo_id,
             weights_name=weights_name,
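The new keyword only changes who supplies the telemetry metadata: the old hard-coded dict becomes the default, and callers may pass their own. A hedged sketch of the resulting call (this is an internal helper, so the import path may change between releases, and the extra key shown is illustrative):

    from diffusers.loaders.single_file_utils import load_single_file_checkpoint

    state_dict = load_single_file_checkpoint(
        "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors",
        # Illustrative metadata; omitting user_agent keeps the 0.33.0 behavior.
        user_agent={"file_type": "single_file", "framework": "pytorch", "from_pipe": True},
    )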
@@ -638,7 +645,12 @@ def infer_diffusers_model_type(checkpoint):
         model_type = "flux-schnell"
 
     elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["ltx-video"]):
-        if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
+        has_vae = "vae.encoder.conv_in.conv.bias" in checkpoint
+        if any(key.endswith("transformer_blocks.47.scale_shift_table") for key in checkpoint):
+            model_type = "ltx-video-0.9.7"
+        elif has_vae and checkpoint["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
+            model_type = "ltx-video-0.9.5"
+        elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
             model_type = "ltx-video-0.9.1"
         else:
             model_type = "ltx-video"
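The LTX branch now distinguishes four checkpoint generations purely from key names and tensor shapes. A standalone sketch of the same dispatch, handy for inspecting a local state dict before loading (the key names are taken from the hunk above; the function name is ours):

    def classify_ltx_checkpoint(checkpoint: dict) -> str:
        # Only 0.9.7 checkpoints contain a 48th transformer block.
        if any(k.endswith("transformer_blocks.47.scale_shift_table") for k in checkpoint):
            return "ltx-video-0.9.7"
        # 0.9.5 widened the VAE encoder output to 2048 channels.
        if (
            "vae.encoder.conv_in.conv.bias" in checkpoint
            and checkpoint["vae.encoder.conv_out.conv.weight"].shape[1] == 2048
        ):
            return "ltx-video-0.9.5"
        # 0.9.1 introduced a timestep embedder in the VAE decoder.
        if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
            return "ltx-video-0.9.1"
        return "ltx-video"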
@@ -695,6 +707,8 @@ def infer_diffusers_model_type(checkpoint):
     elif CHECKPOINT_KEY_NAMES["wan_vae"] in checkpoint:
         # All Wan models use the same VAE so we can use the same default model repo to fetch the config
         model_type = "wan-t2v-14B"
+    elif CHECKPOINT_KEY_NAMES["hidream"] in checkpoint:
+        model_type = "hidream"
     else:
         model_type = "v1"
 
@@ -2272,7 +2286,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
             f"double_blocks.{i}.txt_attn.proj.bias"
         )
 
-    # single transfomer blocks
+    # single transformer blocks
     for i in range(num_single_layers):
         block_prefix = f"single_transformer_blocks.{i}."
         # norm.linear <- single_blocks.0.modulation.lin
@@ -2403,13 +2417,41 @@ def convert_ltx_vae_checkpoint_to_diffusers(checkpoint, **kwargs):
         "last_scale_shift_table": "scale_shift_table",
     }
 
+    VAE_095_RENAME_DICT = {
+        # decoder
+        "up_blocks.0": "mid_block",
+        "up_blocks.1": "up_blocks.0.upsamplers.0",
+        "up_blocks.2": "up_blocks.0",
+        "up_blocks.3": "up_blocks.1.upsamplers.0",
+        "up_blocks.4": "up_blocks.1",
+        "up_blocks.5": "up_blocks.2.upsamplers.0",
+        "up_blocks.6": "up_blocks.2",
+        "up_blocks.7": "up_blocks.3.upsamplers.0",
+        "up_blocks.8": "up_blocks.3",
+        # encoder
+        "down_blocks.0": "down_blocks.0",
+        "down_blocks.1": "down_blocks.0.downsamplers.0",
+        "down_blocks.2": "down_blocks.1",
+        "down_blocks.3": "down_blocks.1.downsamplers.0",
+        "down_blocks.4": "down_blocks.2",
+        "down_blocks.5": "down_blocks.2.downsamplers.0",
+        "down_blocks.6": "down_blocks.3",
+        "down_blocks.7": "down_blocks.3.downsamplers.0",
+        "down_blocks.8": "mid_block",
+        # common
+        "last_time_embedder": "time_embedder",
+        "last_scale_shift_table": "scale_shift_table",
+    }
+
     VAE_SPECIAL_KEYS_REMAP = {
         "per_channel_statistics.channel": remove_keys_,
         "per_channel_statistics.mean-of-means": remove_keys_,
         "per_channel_statistics.mean-of-stds": remove_keys_,
     }
 
-    if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
+    if converted_state_dict["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
+        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
+    elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
         VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
 
     for key in list(converted_state_dict.keys()):
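Each rename dict is applied as a plain substring substitution over every checkpoint key, so the 0.9.5 table above is a pure remapping of module paths onto the diffusers layout. A condensed sketch of how such a table is consumed (the real loop also runs the VAE_SPECIAL_KEYS_REMAP handlers):

    def apply_rename_dict(state_dict: dict, rename_dict: dict) -> dict:
        renamed = {}
        for key, value in state_dict.items():
            new_key = key
            for source, target in rename_dict.items():
                new_key = new_key.replace(source, target)
            renamed[new_key] = value
        return renamed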
@@ -2838,7 +2880,7 @@ def convert_auraflow_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
 def convert_lumina2_to_diffusers(checkpoint, **kwargs):
     converted_state_dict = {}
 
-    # Original Lumina-Image-2 has an extra norm paramter that is unused
+    # Original Lumina-Image-2 has an extra norm parameter that is unused
     # We just remove it here
     checkpoint.pop("norm_final.weight", None)
 
@@ -3259,3 +3301,181 @@ def convert_wan_vae_to_diffusers(checkpoint, **kwargs):
         converted_state_dict[key] = value
 
     return converted_state_dict
+
+
+def convert_hidream_transformer_to_diffusers(checkpoint, **kwargs):
+    keys = list(checkpoint.keys())
+    for k in keys:
+        if "model.diffusion_model." in k:
+            checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
+
+    return checkpoint
+
+
+def convert_chroma_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
+    converted_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    for k in keys:
+        if "model.diffusion_model." in k:
+            checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
+
+    num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "double_blocks." in k))[-1] + 1  # noqa: C401
+    num_single_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "single_blocks." in k))[-1] + 1  # noqa: C401
+    num_guidance_layers = (
+        list(set(int(k.split(".", 3)[2]) for k in checkpoint if "distilled_guidance_layer.layers." in k))[-1] + 1  # noqa: C401
+    )
+    mlp_ratio = 4.0
+    inner_dim = 3072
+
+    # in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+    # while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation
+    def swap_scale_shift(weight):
+        shift, scale = weight.chunk(2, dim=0)
+        new_weight = torch.cat([scale, shift], dim=0)
+        return new_weight
+
+    # guidance
+    converted_state_dict["distilled_guidance_layer.in_proj.bias"] = checkpoint.pop(
+        "distilled_guidance_layer.in_proj.bias"
+    )
+    converted_state_dict["distilled_guidance_layer.in_proj.weight"] = checkpoint.pop(
+        "distilled_guidance_layer.in_proj.weight"
+    )
+    converted_state_dict["distilled_guidance_layer.out_proj.bias"] = checkpoint.pop(
+        "distilled_guidance_layer.out_proj.bias"
+    )
+    converted_state_dict["distilled_guidance_layer.out_proj.weight"] = checkpoint.pop(
+        "distilled_guidance_layer.out_proj.weight"
+    )
+    for i in range(num_guidance_layers):
+        block_prefix = f"distilled_guidance_layer.layers.{i}."
+        converted_state_dict[f"{block_prefix}linear_1.bias"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.in_layer.bias"
+        )
+        converted_state_dict[f"{block_prefix}linear_1.weight"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.in_layer.weight"
+        )
+        converted_state_dict[f"{block_prefix}linear_2.bias"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.out_layer.bias"
+        )
+        converted_state_dict[f"{block_prefix}linear_2.weight"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.out_layer.weight"
+        )
+        converted_state_dict[f"distilled_guidance_layer.norms.{i}.weight"] = checkpoint.pop(
+            f"distilled_guidance_layer.norms.{i}.scale"
+        )
+
+    # context_embedder
+    converted_state_dict["context_embedder.weight"] = checkpoint.pop("txt_in.weight")
+    converted_state_dict["context_embedder.bias"] = checkpoint.pop("txt_in.bias")
+
+    # x_embedder
+    converted_state_dict["x_embedder.weight"] = checkpoint.pop("img_in.weight")
+    converted_state_dict["x_embedder.bias"] = checkpoint.pop("img_in.bias")
+
+    # double transformer blocks
+    for i in range(num_layers):
+        block_prefix = f"transformer_blocks.{i}."
+        # Q, K, V
+        sample_q, sample_k, sample_v = torch.chunk(checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.weight"), 3, dim=0)
+        context_q, context_k, context_v = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.weight"), 3, dim=0
+        )
+        sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.bias"), 3, dim=0
+        )
+        context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.bias"), 3, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([sample_q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([sample_q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([sample_k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([sample_k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([sample_v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([sample_v_bias])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.weight"] = torch.cat([context_q])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.bias"] = torch.cat([context_q_bias])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.weight"] = torch.cat([context_k])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.bias"] = torch.cat([context_k_bias])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.weight"] = torch.cat([context_v])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.bias"] = torch.cat([context_v_bias])
+        # qk_norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.norm.key_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
+        )
+        # ff img_mlp
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.0.bias")
+        converted_state_dict[f"{block_prefix}ff.net.2.weight"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.weight")
+        converted_state_dict[f"{block_prefix}ff.net.2.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.bias")
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.0.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.2.bias"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.proj.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.proj.bias"
+        )
+
+    # single transformer blocks
+    for i in range(num_single_layers):
+        block_prefix = f"single_transformer_blocks.{i}."
+        # Q, K, V, mlp
+        mlp_hidden_dim = int(inner_dim * mlp_ratio)
+        split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim)
+        q, k, v, mlp = torch.split(checkpoint.pop(f"single_blocks.{i}.linear1.weight"), split_size, dim=0)
+        q_bias, k_bias, v_bias, mlp_bias = torch.split(
+            checkpoint.pop(f"single_blocks.{i}.linear1.bias"), split_size, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([v_bias])
+        converted_state_dict[f"{block_prefix}proj_mlp.weight"] = torch.cat([mlp])
+        converted_state_dict[f"{block_prefix}proj_mlp.bias"] = torch.cat([mlp_bias])
+        # qk norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop(
+            f"single_blocks.{i}.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop(
+            f"single_blocks.{i}.norm.key_norm.scale"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}proj_out.weight"] = checkpoint.pop(f"single_blocks.{i}.linear2.weight")
+        converted_state_dict[f"{block_prefix}proj_out.bias"] = checkpoint.pop(f"single_blocks.{i}.linear2.bias")
+
+    converted_state_dict["proj_out.weight"] = checkpoint.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = checkpoint.pop("final_layer.linear.bias")
+
+    return converted_state_dict
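With these converters registered, HiDream and Chroma checkpoints published as a single .safetensors file load through the usual single-file entry point. A hedged sketch for Chroma (the repository and file name are illustrative, not an official release path):

    import torch
    from diffusers import ChromaTransformer2DModel

    transformer = ChromaTransformer2DModel.from_single_file(
        "https://huggingface.co/lodestones/Chroma/blob/main/chroma-unlocked-v35.safetensors",  # illustrative
        torch_dtype=torch.bfloat16,
    )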
diffusers/loaders/textual_inversion.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -427,7 +427,8 @@ class TextualInversionLoaderMixin:
                    logger.info(
                        "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
                    )
-                   remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+                   if is_sequential_cpu_offload or is_model_cpu_offload:
+                       remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
 
        # 7.2 save expected device and dtype
        device = text_encoder.device
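The guard matters when the pipeline uses accelerate offloading: hooks are now removed only when one of the offload modes is actually active, then re-applied after the embedding is loaded. A typical sequence that exercises this code path (model ids illustrative):

    import torch
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
    )
    pipe.enable_model_cpu_offload()

    # Offload hooks are detected, removed, and re-applied around the load.
    pipe.load_textual_inversion("sd-concepts-library/cat-toy")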
diffusers/loaders/transformer_sd3.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -123,7 +123,7 @@ class SD3Transformer2DLoadersMixin:
            key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj")
            updated_state_dict[key] = value
 
-        # Image projetion parameters
+        # Image projection parameters
        embed_dim = updated_state_dict["proj_in.weight"].shape[1]
        output_dim = updated_state_dict["proj_out.weight"].shape[0]
        hidden_dim = updated_state_dict["proj_in.weight"].shape[0]
diffusers/loaders/unet.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -155,10 +155,7 @@ class UNet2DConditionLoadersMixin:
        use_safetensors = True
        allow_pickle = True
 
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
 
        model_file = None
        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
@@ -397,17 +394,6 @@ class UNet2DConditionLoadersMixin:
    @classmethod
    # Copied from diffusers.loaders.lora_base.LoraBaseMixin._optionally_disable_offloading
    def _optionally_disable_offloading(cls, _pipeline):
-        """
-        Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU.
-
-        Args:
-            _pipeline (`DiffusionPipeline`):
-                The pipeline to disable offloading for.
-
-        Returns:
-            tuple:
-                A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
-        """
        return _func_optionally_disable_offloading(_pipeline=_pipeline)
 
    def save_attn_procs(
diffusers/loaders/utils.py
CHANGED
diffusers/models/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@ if is_torch_available():
    _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
    _import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"]
    _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
+    _import_structure["autoencoders.autoencoder_kl_cosmos"] = ["AutoencoderKLCosmos"]
    _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = ["AutoencoderKLHunyuanVideo"]
    _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
    _import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"]
@@ -49,6 +50,7 @@ if is_torch_available():
        "HunyuanDiT2DControlNetModel",
        "HunyuanDiT2DMultiControlNetModel",
    ]
+    _import_structure["controlnets.controlnet_sana"] = ["SanaControlNetModel"]
    _import_structure["controlnets.controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
    _import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"]
    _import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"]
@@ -72,11 +74,15 @@ if is_torch_available():
    _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
    _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
    _import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"]
+    _import_structure["transformers.transformer_chroma"] = ["ChromaTransformer2DModel"]
    _import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"]
    _import_structure["transformers.transformer_cogview4"] = ["CogView4Transformer2DModel"]
+    _import_structure["transformers.transformer_cosmos"] = ["CosmosTransformer3DModel"]
    _import_structure["transformers.transformer_easyanimate"] = ["EasyAnimateTransformer3DModel"]
    _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
+    _import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
    _import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
+    _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
    _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
    _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
    _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
@@ -84,6 +90,7 @@ if is_torch_available():
    _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
    _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
    _import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
+    _import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
    _import_structure["unets.unet_1d"] = ["UNet1DModel"]
    _import_structure["unets.unet_2d"] = ["UNet2DModel"]
    _import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"]
@@ -111,6 +118,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        AutoencoderKL,
        AutoencoderKLAllegro,
        AutoencoderKLCogVideoX,
+        AutoencoderKLCosmos,
        AutoencoderKLHunyuanVideo,
        AutoencoderKLLTXVideo,
        AutoencoderKLMagvit,
@@ -133,6 +141,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        HunyuanDiT2DMultiControlNetModel,
        MultiControlNetModel,
        MultiControlNetUnionModel,
+        SanaControlNetModel,
        SD3ControlNetModel,
        SD3MultiControlNetModel,
        SparseControlNetModel,
@@ -143,15 +152,19 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .transformers import (
        AllegroTransformer3DModel,
        AuraFlowTransformer2DModel,
+        ChromaTransformer2DModel,
        CogVideoXTransformer3DModel,
        CogView3PlusTransformer2DModel,
        CogView4Transformer2DModel,
        ConsisIDTransformer3DModel,
+        CosmosTransformer3DModel,
        DiTTransformer2DModel,
        DualTransformer2DModel,
        EasyAnimateTransformer3DModel,
        FluxTransformer2DModel,
+        HiDreamImageTransformer2DModel,
        HunyuanDiT2DModel,
+        HunyuanVideoFramepackTransformer3DModel,
        HunyuanVideoTransformer3DModel,
        LatteTransformer3DModel,
        LTXVideoTransformer3DModel,
@@ -168,6 +181,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        Transformer2DModel,
        TransformerTemporalModel,
        WanTransformer3DModel,
+        WanVACETransformer3DModel,
    )
    from .unets import (
        I2VGenXLUNet,
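The `_import_structure` table and the `TYPE_CHECKING` import block are the two halves of diffusers' lazy-module machinery: at runtime a name is resolved on first attribute access, while static tooling sees ordinary imports. Every class registered above is therefore importable the usual way:

    # Both forms resolve the same lazily-loaded classes new in 0.34.0.
    from diffusers import AutoencoderKLCosmos, ChromaTransformer2DModel
    from diffusers.models import HunyuanVideoFramepackTransformer3DModel, WanVACETransformer3DModel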
diffusers/models/activations.py
CHANGED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2024 HuggingFace Inc.
+# Copyright 2025 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -92,7 +92,7 @@ class GELU(nn.Module):
 
 class GEGLU(nn.Module):
     r"""
-    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function.
+    A [variant](https://huggingface.co/papers/2002.05202) of the gated linear unit activation function.
 
     Parameters:
         dim_in (`int`): The number of channels in the input.
@@ -125,8 +125,8 @@ class GEGLU(nn.Module):
 
 class SwiGLU(nn.Module):
     r"""
-    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU`
-    but uses SiLU / Swish instead of GeLU.
+    A [variant](https://huggingface.co/papers/2002.05202) of the gated linear unit activation function. It's similar to
+    `GEGLU` but uses SiLU / Swish instead of GeLU.
 
     Parameters:
         dim_in (`int`): The number of channels in the input.
@@ -149,7 +149,7 @@ class SwiGLU(nn.Module):
 class ApproximateGELU(nn.Module):
     r"""
     The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this
-    [paper](https://arxiv.org/abs/1606.08415).
+    [paper](https://huggingface.co/papers/1606.08415).
 
     Parameters:
         dim_in (`int`): The number of channels in the input.
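To make the docstring wording concrete: both activations project the input to twice the output width, split the result, and gate one half with the other. A minimal PyTorch sketch of the two variants (ours, for illustration; the shipped `GEGLU`/`SwiGLU` classes add dtype and backend handling):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class GEGLUSketch(nn.Module):
    """Gated linear unit variant: the gate half goes through GELU."""

    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden, gate = self.proj(x).chunk(2, dim=-1)
        return hidden * F.gelu(gate)


class SwiGLUSketch(nn.Module):
    """Identical structure, but the gate uses SiLU / Swish instead of GELU."""

    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden, gate = self.proj(x).chunk(2, dim=-1)
        return hidden * F.silu(gate)
```

The only difference between the two is the gating nonlinearity, which is exactly the distinction the corrected `SwiGLU` docstring draws.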
diffusers/models/adapter.py
CHANGED
@@ -161,9 +161,8 @@ class MultiAdapter(ModelMixin):
         pretrained_model_path (`os.PathLike`):
             A path to a *directory* containing model weights saved using
             [`~diffusers.models.adapter.MultiAdapter.save_pretrained`], e.g., `./my_model_directory/adapter`.
-        torch_dtype (`str` or `torch.dtype`, *optional*):
-            Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed, the dtype
-            will be automatically derived from the model's weights.
+        torch_dtype (`torch.dtype`, *optional*):
+            Override the default `torch.dtype` and load the model under this dtype.
         output_loading_info(`bool`, *optional*, defaults to `False`):
             Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
         device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
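A short usage sketch of the parameter whose docstring was tightened above; the checkpoint directory is hypothetical, and per the updated text `torch_dtype` is documented as a `torch.dtype` only (no `"auto"` string):

```python
import torch
from diffusers.models import MultiAdapter

# Hypothetical directory previously written by MultiAdapter.save_pretrained().
adapter = MultiAdapter.from_pretrained(
    "./my_model_directory/adapter",
    torch_dtype=torch.float16,  # a torch.dtype, per the corrected docstring
)
```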
diffusers/models/attention.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -90,7 +90,7 @@ class JointTransformerBlock(nn.Module):
     r"""
     A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
 
-    Reference: https://arxiv.org/abs/2403.03206
+    Reference: https://huggingface.co/papers/2403.03206
 
     Parameters:
         dim (`int`): The number of channels in the input and output.
@@ -892,8 +892,8 @@ class FreeNoiseTransformerBlock(nn.Module):
             The number of frames to be skipped before starting to process a new batch of `context_length` frames.
         weighting_scheme (`str`, defaults to `"pyramid"`):
             The weighting scheme to use for weighting averaging of processed latent frames. As described in the
-            Equation 9. of the [FreeNoise](https://arxiv.org/abs/2310.15169) paper, "pyramid" is the default setting
-            used.
+            Equation 9. of the [FreeNoise](https://huggingface.co/papers/2310.15169) paper, "pyramid" is the default
+            setting used.
     """
 
     def __init__(
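As a rough illustration of the "pyramid" weighting the FreeNoise docstring refers to: overlapping windows of processed latent frames are averaged with weights that peak at the window center. This sketch is our reading of the scheme, not the exact diffusers implementation:

```python
def pyramid_weights(context_length: int) -> list[int]:
    """Symmetric 1..peak..1 weights, e.g. context_length=5 -> [1, 2, 3, 2, 1]."""
    half = list(range(1, context_length // 2 + 1))
    middle = [context_length // 2 + 1] if context_length % 2 == 1 else []
    return half + middle + half[::-1]


# Frames covered by several overlapping windows then get a weighted average:
# out[t] = sum_w(weight_w[t] * frame_w[t]) / sum_w(weight_w[t])
```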
diffusers/models/attention_flax.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -75,7 +75,7 @@ def jax_memory_efficient_attention(
     query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
 ):
     r"""
-    Flax Memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
+    Flax Memory-efficient multi-head dot product attention. https://huggingface.co/papers/2112.05682v2
     https://github.com/AminRezaei0x443/memory-efficient-attention
 
     Args:
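The memory saving in `jax_memory_efficient_attention` comes from never materializing the full seq x seq score matrix at once. A simplified query-chunked sketch of the idea (our illustration; the referenced algorithm additionally chunks keys with a numerically stable streaming softmax, which is what `key_chunk_size` controls):

```python
import jax
import jax.numpy as jnp


def query_chunked_attention(q, k, v, query_chunk_size=1024):
    """Attention computed one query chunk at a time.

    Shapes are (seq_len, num_heads, head_dim); peak score memory is
    query_chunk_size x seq_len rather than seq_len x seq_len.
    """
    scale = q.shape[-1] ** -0.5

    def attend(q_chunk):
        scores = jnp.einsum("qhd,khd->qhk", q_chunk, k) * scale
        weights = jax.nn.softmax(scores, axis=-1)
        return jnp.einsum("qhk,khd->qhd", weights, v)

    chunks = [
        attend(q[i : i + query_chunk_size])
        for i in range(0, q.shape[0], query_chunk_size)
    ]
    return jnp.concatenate(chunks, axis=0)
```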
@@ -121,7 +121,7 @@ def jax_memory_efficient_attention(
 
 class FlaxAttention(nn.Module):
     r"""
-    A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762
+    A Flax multi-head attention module as described in: https://huggingface.co/papers/1706.03762
 
     Parameters:
         query_dim (:obj:`int`):
@@ -133,7 +133,7 @@ class FlaxAttention(nn.Module):
         dropout (:obj:`float`, *optional*, defaults to 0.0):
             Dropout rate
         use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
-            enable memory efficient attention https://arxiv.org/abs/2112.05682
+            enable memory efficient attention https://huggingface.co/papers/2112.05682
         split_head_dim (`bool`, *optional*, defaults to `False`):
             Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
             enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
@@ -244,7 +244,7 @@ class FlaxAttention(nn.Module):
 class FlaxBasicTransformerBlock(nn.Module):
     r"""
     A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
-    https://arxiv.org/abs/1706.03762
+    https://huggingface.co/papers/1706.03762
 
 
     Parameters:
@@ -261,7 +261,7 @@ class FlaxBasicTransformerBlock(nn.Module):
         dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
             Parameters `dtype`
         use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
-            enable memory efficient attention https://arxiv.org/abs/2112.05682
+            enable memory efficient attention https://huggingface.co/papers/2112.05682
         split_head_dim (`bool`, *optional*, defaults to `False`):
             Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
             enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
@@ -328,7 +328,7 @@ class FlaxBasicTransformerBlock(nn.Module):
 class FlaxTransformer2DModel(nn.Module):
     r"""
     A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
-    https://arxiv.org/abs/1506.02025
+    https://huggingface.co/papers/1506.02025
 
 
     Parameters:
@@ -347,7 +347,7 @@ class FlaxTransformer2DModel(nn.Module):
         dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
             Parameters `dtype`
         use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
-            enable memory efficient attention https://arxiv.org/abs/2112.05682
+            enable memory efficient attention https://huggingface.co/papers/2112.05682
         split_head_dim (`bool`, *optional*, defaults to `False`):
             Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
             enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
@@ -436,7 +436,7 @@ class FlaxFeedForward(nn.Module):
     Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
     [`FeedForward`] class, with the following simplifications:
     - The activation function is currently hardcoded to a gated linear unit from:
-    https://arxiv.org/abs/2002.05202
+    https://huggingface.co/papers/2002.05202
     - `dim_out` is equal to `dim`.
     - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].
 
@@ -468,7 +468,7 @@ class FlaxFeedForward(nn.Module):
 class FlaxGEGLU(nn.Module):
     r"""
     Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
-    https://arxiv.org/abs/2002.05202.
+    https://huggingface.co/papers/2002.05202.
 
     Parameters:
         dim (:obj:`int`):