diffusers 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (478)
  1. diffusers/__init__.py +48 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/dependency_versions_check.py +1 -1
  7. diffusers/dependency_versions_table.py +1 -1
  8. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  9. diffusers/hooks/faster_cache.py +2 -2
  10. diffusers/hooks/group_offloading.py +128 -29
  11. diffusers/hooks/hooks.py +2 -2
  12. diffusers/hooks/layerwise_casting.py +3 -3
  13. diffusers/hooks/pyramid_attention_broadcast.py +1 -1
  14. diffusers/image_processor.py +7 -2
  15. diffusers/loaders/__init__.py +4 -0
  16. diffusers/loaders/ip_adapter.py +5 -14
  17. diffusers/loaders/lora_base.py +212 -111
  18. diffusers/loaders/lora_conversion_utils.py +275 -34
  19. diffusers/loaders/lora_pipeline.py +1554 -819
  20. diffusers/loaders/peft.py +52 -109
  21. diffusers/loaders/single_file.py +2 -2
  22. diffusers/loaders/single_file_model.py +20 -4
  23. diffusers/loaders/single_file_utils.py +225 -5
  24. diffusers/loaders/textual_inversion.py +3 -2
  25. diffusers/loaders/transformer_flux.py +1 -1
  26. diffusers/loaders/transformer_sd3.py +2 -2
  27. diffusers/loaders/unet.py +2 -16
  28. diffusers/loaders/unet_loader_utils.py +1 -1
  29. diffusers/loaders/utils.py +1 -1
  30. diffusers/models/__init__.py +15 -1
  31. diffusers/models/activations.py +5 -5
  32. diffusers/models/adapter.py +2 -3
  33. diffusers/models/attention.py +4 -4
  34. diffusers/models/attention_flax.py +10 -10
  35. diffusers/models/attention_processor.py +14 -10
  36. diffusers/models/auto_model.py +47 -10
  37. diffusers/models/autoencoders/__init__.py +1 -0
  38. diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
  39. diffusers/models/autoencoders/autoencoder_dc.py +3 -3
  40. diffusers/models/autoencoders/autoencoder_kl.py +4 -4
  41. diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
  42. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
  43. diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
  44. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
  45. diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
  46. diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
  47. diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
  48. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
  49. diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
  50. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
  51. diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
  52. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  53. diffusers/models/autoencoders/vae.py +13 -2
  54. diffusers/models/autoencoders/vq_model.py +2 -2
  55. diffusers/models/cache_utils.py +1 -1
  56. diffusers/models/controlnet.py +1 -1
  57. diffusers/models/controlnet_flux.py +1 -1
  58. diffusers/models/controlnet_sd3.py +1 -1
  59. diffusers/models/controlnet_sparsectrl.py +1 -1
  60. diffusers/models/controlnets/__init__.py +1 -0
  61. diffusers/models/controlnets/controlnet.py +3 -3
  62. diffusers/models/controlnets/controlnet_flax.py +1 -1
  63. diffusers/models/controlnets/controlnet_flux.py +16 -15
  64. diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
  65. diffusers/models/controlnets/controlnet_sana.py +290 -0
  66. diffusers/models/controlnets/controlnet_sd3.py +1 -1
  67. diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
  68. diffusers/models/controlnets/controlnet_union.py +1 -1
  69. diffusers/models/controlnets/controlnet_xs.py +7 -7
  70. diffusers/models/controlnets/multicontrolnet.py +4 -5
  71. diffusers/models/controlnets/multicontrolnet_union.py +5 -6
  72. diffusers/models/downsampling.py +2 -2
  73. diffusers/models/embeddings.py +10 -12
  74. diffusers/models/embeddings_flax.py +2 -2
  75. diffusers/models/lora.py +3 -3
  76. diffusers/models/modeling_utils.py +44 -14
  77. diffusers/models/normalization.py +4 -4
  78. diffusers/models/resnet.py +2 -2
  79. diffusers/models/resnet_flax.py +1 -1
  80. diffusers/models/transformers/__init__.py +5 -0
  81. diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
  82. diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
  83. diffusers/models/transformers/consisid_transformer_3d.py +1 -1
  84. diffusers/models/transformers/dit_transformer_2d.py +2 -2
  85. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  86. diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
  87. diffusers/models/transformers/latte_transformer_3d.py +4 -5
  88. diffusers/models/transformers/lumina_nextdit2d.py +2 -2
  89. diffusers/models/transformers/pixart_transformer_2d.py +3 -3
  90. diffusers/models/transformers/prior_transformer.py +1 -1
  91. diffusers/models/transformers/sana_transformer.py +8 -3
  92. diffusers/models/transformers/stable_audio_transformer.py +5 -9
  93. diffusers/models/transformers/t5_film_transformer.py +3 -3
  94. diffusers/models/transformers/transformer_2d.py +1 -1
  95. diffusers/models/transformers/transformer_allegro.py +1 -1
  96. diffusers/models/transformers/transformer_chroma.py +742 -0
  97. diffusers/models/transformers/transformer_cogview3plus.py +5 -10
  98. diffusers/models/transformers/transformer_cogview4.py +317 -25
  99. diffusers/models/transformers/transformer_cosmos.py +579 -0
  100. diffusers/models/transformers/transformer_flux.py +9 -11
  101. diffusers/models/transformers/transformer_hidream_image.py +942 -0
  102. diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
  103. diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
  104. diffusers/models/transformers/transformer_ltx.py +2 -2
  105. diffusers/models/transformers/transformer_lumina2.py +1 -1
  106. diffusers/models/transformers/transformer_mochi.py +1 -1
  107. diffusers/models/transformers/transformer_omnigen.py +2 -2
  108. diffusers/models/transformers/transformer_sd3.py +7 -7
  109. diffusers/models/transformers/transformer_temporal.py +1 -1
  110. diffusers/models/transformers/transformer_wan.py +24 -8
  111. diffusers/models/transformers/transformer_wan_vace.py +393 -0
  112. diffusers/models/unets/unet_1d.py +1 -1
  113. diffusers/models/unets/unet_1d_blocks.py +1 -1
  114. diffusers/models/unets/unet_2d.py +1 -1
  115. diffusers/models/unets/unet_2d_blocks.py +1 -1
  116. diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
  117. diffusers/models/unets/unet_2d_condition.py +2 -2
  118. diffusers/models/unets/unet_2d_condition_flax.py +2 -2
  119. diffusers/models/unets/unet_3d_blocks.py +1 -1
  120. diffusers/models/unets/unet_3d_condition.py +3 -3
  121. diffusers/models/unets/unet_i2vgen_xl.py +3 -3
  122. diffusers/models/unets/unet_kandinsky3.py +1 -1
  123. diffusers/models/unets/unet_motion_model.py +2 -2
  124. diffusers/models/unets/unet_stable_cascade.py +1 -1
  125. diffusers/models/upsampling.py +2 -2
  126. diffusers/models/vae_flax.py +2 -2
  127. diffusers/models/vq_model.py +1 -1
  128. diffusers/pipelines/__init__.py +37 -6
  129. diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
  130. diffusers/pipelines/amused/pipeline_amused.py +7 -6
  131. diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
  132. diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
  133. diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
  134. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
  135. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
  136. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
  137. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
  138. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
  139. diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
  140. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  141. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
  142. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
  143. diffusers/pipelines/auto_pipeline.py +6 -7
  144. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  145. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  146. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
  147. diffusers/pipelines/chroma/__init__.py +49 -0
  148. diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
  149. diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
  150. diffusers/pipelines/chroma/pipeline_output.py +21 -0
  151. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
  152. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
  153. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
  154. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
  155. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
  156. diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
  157. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
  158. diffusers/pipelines/consisid/consisid_utils.py +2 -2
  159. diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
  160. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  161. diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
  162. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
  163. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
  164. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
  165. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
  166. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
  167. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
  168. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
  169. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
  170. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
  171. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  172. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
  173. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
  174. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  175. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
  176. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
  177. diffusers/pipelines/cosmos/__init__.py +54 -0
  178. diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
  179. diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
  180. diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
  181. diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
  182. diffusers/pipelines/cosmos/pipeline_output.py +40 -0
  183. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
  184. diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
  185. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  186. diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
  187. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
  188. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
  189. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
  190. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
  191. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
  192. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
  193. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
  194. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  195. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
  196. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  197. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
  198. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
  199. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  200. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  201. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  202. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  203. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  204. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
  205. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
  206. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
  207. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
  208. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
  209. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
  210. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  211. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
  212. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
  213. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
  214. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
  215. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
  216. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  217. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  218. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
  219. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
  220. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
  221. diffusers/pipelines/flux/modeling_flux.py +1 -1
  222. diffusers/pipelines/flux/pipeline_flux.py +10 -17
  223. diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
  224. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
  225. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
  226. diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
  227. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
  228. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
  229. diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
  230. diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
  231. diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
  232. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
  233. diffusers/pipelines/free_init_utils.py +2 -2
  234. diffusers/pipelines/free_noise_utils.py +3 -3
  235. diffusers/pipelines/hidream_image/__init__.py +47 -0
  236. diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
  237. diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
  238. diffusers/pipelines/hunyuan_video/__init__.py +2 -0
  239. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
  240. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
  241. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
  242. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
  243. diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
  244. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
  245. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
  246. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
  247. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
  248. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
  249. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
  250. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
  251. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  252. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
  253. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
  254. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
  255. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
  256. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
  257. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
  258. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
  259. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
  260. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
  261. diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
  262. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
  263. diffusers/pipelines/kolors/text_encoder.py +3 -3
  264. diffusers/pipelines/kolors/tokenizer.py +1 -1
  265. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
  266. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
  267. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  268. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
  269. diffusers/pipelines/latte/pipeline_latte.py +12 -12
  270. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
  271. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
  272. diffusers/pipelines/ltx/__init__.py +4 -0
  273. diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
  274. diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
  275. diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
  276. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
  277. diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
  278. diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
  279. diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
  280. diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
  281. diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
  282. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
  283. diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
  284. diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
  285. diffusers/pipelines/onnx_utils.py +15 -2
  286. diffusers/pipelines/pag/pag_utils.py +2 -2
  287. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
  288. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
  289. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
  290. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
  291. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
  292. diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
  293. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
  294. diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
  295. diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
  296. diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
  297. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
  298. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
  299. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
  300. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
  301. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
  302. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
  303. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
  304. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  305. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
  306. diffusers/pipelines/pia/pipeline_pia.py +8 -6
  307. diffusers/pipelines/pipeline_flax_utils.py +3 -4
  308. diffusers/pipelines/pipeline_loading_utils.py +89 -13
  309. diffusers/pipelines/pipeline_utils.py +105 -33
  310. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
  311. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
  312. diffusers/pipelines/sana/__init__.py +4 -0
  313. diffusers/pipelines/sana/pipeline_sana.py +23 -21
  314. diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
  315. diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
  316. diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
  317. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
  318. diffusers/pipelines/shap_e/camera.py +1 -1
  319. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  320. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  321. diffusers/pipelines/shap_e/renderer.py +3 -3
  322. diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
  323. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
  324. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
  325. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
  326. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
  327. diffusers/pipelines/stable_diffusion/__init__.py +0 -7
  328. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  329. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
  330. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  331. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
  332. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  333. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
  334. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
  335. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
  336. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
  337. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
  338. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
  339. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
  340. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
  341. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
  342. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
  343. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
  344. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
  345. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
  346. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
  347. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  348. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  349. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  350. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
  351. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
  352. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
  353. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
  354. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
  355. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
  356. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
  357. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
  358. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
  359. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
  360. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
  361. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
  362. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  363. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
  364. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  365. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
  366. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
  367. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
  368. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
  369. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
  370. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
  371. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
  372. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
  373. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
  374. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
  375. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
  376. diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
  377. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
  378. diffusers/pipelines/unclip/text_proj.py +2 -2
  379. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
  380. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  381. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
  382. diffusers/pipelines/visualcloze/__init__.py +52 -0
  383. diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
  384. diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
  385. diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
  386. diffusers/pipelines/wan/__init__.py +2 -0
  387. diffusers/pipelines/wan/pipeline_wan.py +17 -12
  388. diffusers/pipelines/wan/pipeline_wan_i2v.py +42 -20
  389. diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
  390. diffusers/pipelines/wan/pipeline_wan_video2video.py +18 -18
  391. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  392. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
  393. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  394. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  395. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
  396. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
  397. diffusers/quantizers/__init__.py +179 -1
  398. diffusers/quantizers/base.py +6 -1
  399. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
  400. diffusers/quantizers/bitsandbytes/utils.py +10 -7
  401. diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
  402. diffusers/quantizers/gguf/utils.py +16 -13
  403. diffusers/quantizers/quantization_config.py +18 -16
  404. diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
  405. diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
  406. diffusers/schedulers/__init__.py +3 -1
  407. diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
  408. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  409. diffusers/schedulers/scheduling_consistency_models.py +1 -1
  410. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
  411. diffusers/schedulers/scheduling_ddim.py +8 -8
  412. diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
  413. diffusers/schedulers/scheduling_ddim_flax.py +6 -6
  414. diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
  415. diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
  416. diffusers/schedulers/scheduling_ddpm.py +9 -9
  417. diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
  418. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
  419. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
  420. diffusers/schedulers/scheduling_deis_multistep.py +8 -8
  421. diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
  422. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
  423. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
  424. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
  425. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  426. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
  427. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
  428. diffusers/schedulers/scheduling_edm_euler.py +20 -11
  429. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
  430. diffusers/schedulers/scheduling_euler_discrete.py +3 -3
  431. diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
  432. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
  433. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
  434. diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
  435. diffusers/schedulers/scheduling_heun_discrete.py +2 -2
  436. diffusers/schedulers/scheduling_ipndm.py +2 -2
  437. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
  438. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
  439. diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
  440. diffusers/schedulers/scheduling_lcm.py +3 -3
  441. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  442. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  443. diffusers/schedulers/scheduling_pndm.py +4 -4
  444. diffusers/schedulers/scheduling_pndm_flax.py +4 -4
  445. diffusers/schedulers/scheduling_repaint.py +9 -9
  446. diffusers/schedulers/scheduling_sasolver.py +15 -15
  447. diffusers/schedulers/scheduling_scm.py +1 -1
  448. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  449. diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
  450. diffusers/schedulers/scheduling_tcd.py +3 -3
  451. diffusers/schedulers/scheduling_unclip.py +5 -5
  452. diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
  453. diffusers/schedulers/scheduling_utils.py +1 -1
  454. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  455. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  456. diffusers/training_utils.py +13 -5
  457. diffusers/utils/__init__.py +5 -0
  458. diffusers/utils/accelerate_utils.py +1 -1
  459. diffusers/utils/doc_utils.py +1 -1
  460. diffusers/utils/dummy_pt_objects.py +120 -0
  461. diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
  462. diffusers/utils/dynamic_modules_utils.py +21 -3
  463. diffusers/utils/export_utils.py +1 -1
  464. diffusers/utils/import_utils.py +81 -18
  465. diffusers/utils/logging.py +1 -1
  466. diffusers/utils/outputs.py +2 -1
  467. diffusers/utils/peft_utils.py +91 -8
  468. diffusers/utils/state_dict_utils.py +20 -3
  469. diffusers/utils/testing_utils.py +59 -7
  470. diffusers/utils/torch_utils.py +25 -5
  471. diffusers/video_processor.py +2 -2
  472. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/METADATA +3 -3
  473. diffusers-0.34.0.dist-info/RECORD +639 -0
  474. diffusers-0.33.0.dist-info/RECORD +0 -608
  475. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
  476. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/WHEEL +0 -0
  477. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
  478. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
diffusers/loaders/single_file_utils.py CHANGED
@@ -126,6 +126,7 @@ CHECKPOINT_KEY_NAMES = {
     ],
     "wan": ["model.diffusion_model.head.modulation", "head.modulation"],
     "wan_vae": "decoder.middle.0.residual.0.gamma",
+    "hidream": "double_stream_blocks.0.block.adaLN_modulation.1.bias",
 }
 
 DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -177,6 +178,8 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
     "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
     "ltx-video": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.0"},
     "ltx-video-0.9.1": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.1"},
+    "ltx-video-0.9.5": {"pretrained_model_name_or_path": "Lightricks/LTX-Video-0.9.5"},
+    "ltx-video-0.9.7": {"pretrained_model_name_or_path": "Lightricks/LTX-Video-0.9.7-dev"},
     "autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
     "autoencoder-dc-f64c128": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers"},
     "autoencoder-dc-f32c32": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers"},
@@ -189,6 +192,7 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
     "wan-t2v-1.3B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
     "wan-t2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-14B-Diffusers"},
     "wan-i2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"},
+    "hidream": {"pretrained_model_name_or_path": "HiDream-ai/HiDream-I1-Dev"},
 }
 
 # Use to configure model sample size when original config is provided
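Note: the two tables above work as a pair. A fingerprint key identifies the model family inside a raw checkpoint, and the family name then selects a default Hub repo to fetch the config from. A condensed sketch of that flow using only the HiDream entries added in this release (the helper name infer_model_type is illustrative, not the library's):

# Condensed sketch; table contents copied from the hunks above.
CHECKPOINT_KEY_NAMES = {"hidream": "double_stream_blocks.0.block.adaLN_modulation.1.bias"}
DIFFUSERS_DEFAULT_PIPELINE_PATHS = {"hidream": {"pretrained_model_name_or_path": "HiDream-ai/HiDream-I1-Dev"}}

def infer_model_type(checkpoint: dict) -> str:
    # A key that only HiDream checkpoints contain marks the whole family.
    if CHECKPOINT_KEY_NAMES["hidream"] in checkpoint:
        return "hidream"
    return "v1"

ckpt = {"double_stream_blocks.0.block.adaLN_modulation.1.bias": None}
repo = DIFFUSERS_DEFAULT_PIPELINE_PATHS[infer_model_type(ckpt)]["pretrained_model_name_or_path"]
assert repo == "HiDream-ai/HiDream-I1-Dev"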
@@ -404,13 +408,16 @@ def load_single_file_checkpoint(
     local_files_only=None,
     revision=None,
     disable_mmap=False,
+    user_agent=None,
 ):
+    if user_agent is None:
+        user_agent = {"file_type": "single_file", "framework": "pytorch"}
+
     if os.path.isfile(pretrained_model_link_or_path):
         pretrained_model_link_or_path = pretrained_model_link_or_path
 
     else:
         repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path)
-        user_agent = {"file_type": "single_file", "framework": "pytorch"}
         pretrained_model_link_or_path = _get_model_file(
             repo_id,
             weights_name=weights_name,
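The new user_agent parameter defaults to None and is filled in inside the function body, so callers can tag their own downloads while the old telemetry dict stays the fallback. A minimal sketch of the idiom in isolation (fetch is a hypothetical stand-in, not a diffusers function):

def fetch(path, user_agent=None):
    # Fill in per call; a dict literal in the signature would be shared, mutable state.
    if user_agent is None:
        user_agent = {"file_type": "single_file", "framework": "pytorch"}
    return path, user_agent

assert fetch("model.safetensors")[1]["file_type"] == "single_file"
assert fetch("model.safetensors", {"file_type": "lora"})[1]["file_type"] == "lora"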
@@ -638,7 +645,12 @@ def infer_diffusers_model_type(checkpoint):
         model_type = "flux-schnell"
 
     elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["ltx-video"]):
-        if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
+        has_vae = "vae.encoder.conv_in.conv.bias" in checkpoint
+        if any(key.endswith("transformer_blocks.47.scale_shift_table") for key in checkpoint):
+            model_type = "ltx-video-0.9.7"
+        elif has_vae and checkpoint["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
+            model_type = "ltx-video-0.9.5"
+        elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
             model_type = "ltx-video-0.9.1"
         else:
             model_type = "ltx-video"
@@ -695,6 +707,8 @@ def infer_diffusers_model_type(checkpoint):
     elif CHECKPOINT_KEY_NAMES["wan_vae"] in checkpoint:
         # All Wan models use the same VAE so we can use the same default model repo to fetch the config
         model_type = "wan-t2v-14B"
+    elif CHECKPOINT_KEY_NAMES["hidream"] in checkpoint:
+        model_type = "hidream"
     else:
         model_type = "v1"
 
@@ -2272,7 +2286,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
             f"double_blocks.{i}.txt_attn.proj.bias"
         )
 
-    # single transfomer blocks
+    # single transformer blocks
     for i in range(num_single_layers):
         block_prefix = f"single_transformer_blocks.{i}."
         # norm.linear <- single_blocks.0.modulation.lin
@@ -2403,13 +2417,41 @@ def convert_ltx_vae_checkpoint_to_diffusers(checkpoint, **kwargs):
         "last_scale_shift_table": "scale_shift_table",
     }
 
+    VAE_095_RENAME_DICT = {
+        # decoder
+        "up_blocks.0": "mid_block",
+        "up_blocks.1": "up_blocks.0.upsamplers.0",
+        "up_blocks.2": "up_blocks.0",
+        "up_blocks.3": "up_blocks.1.upsamplers.0",
+        "up_blocks.4": "up_blocks.1",
+        "up_blocks.5": "up_blocks.2.upsamplers.0",
+        "up_blocks.6": "up_blocks.2",
+        "up_blocks.7": "up_blocks.3.upsamplers.0",
+        "up_blocks.8": "up_blocks.3",
+        # encoder
+        "down_blocks.0": "down_blocks.0",
+        "down_blocks.1": "down_blocks.0.downsamplers.0",
+        "down_blocks.2": "down_blocks.1",
+        "down_blocks.3": "down_blocks.1.downsamplers.0",
+        "down_blocks.4": "down_blocks.2",
+        "down_blocks.5": "down_blocks.2.downsamplers.0",
+        "down_blocks.6": "down_blocks.3",
+        "down_blocks.7": "down_blocks.3.downsamplers.0",
+        "down_blocks.8": "mid_block",
+        # common
+        "last_time_embedder": "time_embedder",
+        "last_scale_shift_table": "scale_shift_table",
+    }
+
     VAE_SPECIAL_KEYS_REMAP = {
         "per_channel_statistics.channel": remove_keys_,
         "per_channel_statistics.mean-of-means": remove_keys_,
         "per_channel_statistics.mean-of-stds": remove_keys_,
     }
 
-    if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
+    if converted_state_dict["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
+        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
+    elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
         VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
 
     for key in list(converted_state_dict.keys()):
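Rename tables like VAE_095_RENAME_DICT are applied by substring replacement over every checkpoint key in the loop that follows the hunk above. A self-contained sketch of the mechanism (RENAME and rename_key are illustrative names; note that in the real table, rule ordering matters because keys like "up_blocks.0" are prefixes of targets produced by other rules):

RENAME = {
    "up_blocks.0": "mid_block",
    "last_time_embedder": "time_embedder",
}

def rename_key(key: str) -> str:
    # Apply each rule in insertion order via plain substring replacement.
    for old, new in RENAME.items():
        key = key.replace(old, new)
    return key

state_dict = {"decoder.up_blocks.0.conv.weight": 0, "decoder.last_time_embedder.w": 1}
state_dict = {rename_key(k): v for k, v in state_dict.items()}
assert "decoder.mid_block.conv.weight" in state_dict
assert "decoder.time_embedder.w" in state_dict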
@@ -2838,7 +2880,7 @@ def convert_auraflow_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
 def convert_lumina2_to_diffusers(checkpoint, **kwargs):
     converted_state_dict = {}
 
-    # Original Lumina-Image-2 has an extra norm paramter that is unused
+    # Original Lumina-Image-2 has an extra norm parameter that is unused
     # We just remove it here
     checkpoint.pop("norm_final.weight", None)
 
@@ -3259,3 +3301,181 @@ def convert_wan_vae_to_diffusers(checkpoint, **kwargs):
         converted_state_dict[key] = value
 
     return converted_state_dict
+
+
+def convert_hidream_transformer_to_diffusers(checkpoint, **kwargs):
+    keys = list(checkpoint.keys())
+    for k in keys:
+        if "model.diffusion_model." in k:
+            checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
+
+    return checkpoint
+
+
+def convert_chroma_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
+    converted_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    for k in keys:
+        if "model.diffusion_model." in k:
+            checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
+
+    num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "double_blocks." in k))[-1] + 1  # noqa: C401
+    num_single_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "single_blocks." in k))[-1] + 1  # noqa: C401
+    num_guidance_layers = (
+        list(set(int(k.split(".", 3)[2]) for k in checkpoint if "distilled_guidance_layer.layers." in k))[-1] + 1  # noqa: C401
+    )
+    mlp_ratio = 4.0
+    inner_dim = 3072
+
+    # in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+    # while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation
+    def swap_scale_shift(weight):
+        shift, scale = weight.chunk(2, dim=0)
+        new_weight = torch.cat([scale, shift], dim=0)
+        return new_weight
+
+    # guidance
+    converted_state_dict["distilled_guidance_layer.in_proj.bias"] = checkpoint.pop(
+        "distilled_guidance_layer.in_proj.bias"
+    )
+    converted_state_dict["distilled_guidance_layer.in_proj.weight"] = checkpoint.pop(
+        "distilled_guidance_layer.in_proj.weight"
+    )
+    converted_state_dict["distilled_guidance_layer.out_proj.bias"] = checkpoint.pop(
+        "distilled_guidance_layer.out_proj.bias"
+    )
+    converted_state_dict["distilled_guidance_layer.out_proj.weight"] = checkpoint.pop(
+        "distilled_guidance_layer.out_proj.weight"
+    )
+    for i in range(num_guidance_layers):
+        block_prefix = f"distilled_guidance_layer.layers.{i}."
+        converted_state_dict[f"{block_prefix}linear_1.bias"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.in_layer.bias"
+        )
+        converted_state_dict[f"{block_prefix}linear_1.weight"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.in_layer.weight"
+        )
+        converted_state_dict[f"{block_prefix}linear_2.bias"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.out_layer.bias"
+        )
+        converted_state_dict[f"{block_prefix}linear_2.weight"] = checkpoint.pop(
+            f"distilled_guidance_layer.layers.{i}.out_layer.weight"
+        )
+        converted_state_dict[f"distilled_guidance_layer.norms.{i}.weight"] = checkpoint.pop(
+            f"distilled_guidance_layer.norms.{i}.scale"
+        )
+
+    # context_embedder
+    converted_state_dict["context_embedder.weight"] = checkpoint.pop("txt_in.weight")
+    converted_state_dict["context_embedder.bias"] = checkpoint.pop("txt_in.bias")
+
+    # x_embedder
+    converted_state_dict["x_embedder.weight"] = checkpoint.pop("img_in.weight")
+    converted_state_dict["x_embedder.bias"] = checkpoint.pop("img_in.bias")
+
+    # double transformer blocks
+    for i in range(num_layers):
+        block_prefix = f"transformer_blocks.{i}."
+        # Q, K, V
+        sample_q, sample_k, sample_v = torch.chunk(checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.weight"), 3, dim=0)
+        context_q, context_k, context_v = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.weight"), 3, dim=0
+        )
+        sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.bias"), 3, dim=0
+        )
+        context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+            checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.bias"), 3, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([sample_q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([sample_q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([sample_k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([sample_k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([sample_v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([sample_v_bias])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.weight"] = torch.cat([context_q])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.bias"] = torch.cat([context_q_bias])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.weight"] = torch.cat([context_k])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.bias"] = torch.cat([context_k_bias])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.weight"] = torch.cat([context_v])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.bias"] = torch.cat([context_v_bias])
+        # qk_norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.norm.key_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
+        )
+        # ff img_mlp
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.0.bias")
+        converted_state_dict[f"{block_prefix}ff.net.2.weight"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.weight")
+        converted_state_dict[f"{block_prefix}ff.net.2.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.bias")
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.0.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.0.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_mlp.2.bias"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.img_attn.proj.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.weight"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.bias"] = checkpoint.pop(
+            f"double_blocks.{i}.txt_attn.proj.bias"
+        )
+
+    # single transformer blocks
+    for i in range(num_single_layers):
+        block_prefix = f"single_transformer_blocks.{i}."
+        # Q, K, V, mlp
+        mlp_hidden_dim = int(inner_dim * mlp_ratio)
+        split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim)
+        q, k, v, mlp = torch.split(checkpoint.pop(f"single_blocks.{i}.linear1.weight"), split_size, dim=0)
+        q_bias, k_bias, v_bias, mlp_bias = torch.split(
+            checkpoint.pop(f"single_blocks.{i}.linear1.bias"), split_size, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([v_bias])
+        converted_state_dict[f"{block_prefix}proj_mlp.weight"] = torch.cat([mlp])
+        converted_state_dict[f"{block_prefix}proj_mlp.bias"] = torch.cat([mlp_bias])
+        # qk norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop(
+            f"single_blocks.{i}.norm.query_norm.scale"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop(
+            f"single_blocks.{i}.norm.key_norm.scale"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}proj_out.weight"] = checkpoint.pop(f"single_blocks.{i}.linear2.weight")
+        converted_state_dict[f"{block_prefix}proj_out.bias"] = checkpoint.pop(f"single_blocks.{i}.linear2.bias")
+
+    converted_state_dict["proj_out.weight"] = checkpoint.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = checkpoint.pop("final_layer.linear.bias")
+
+    return converted_state_dict
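The Chroma converter's core move is un-fusing attention projections: the original Flux-style layout stores one fused qkv (or qkv+mlp) matrix per block, while diffusers expects separate to_q/to_k/to_v (and proj_mlp) weights. A toy illustration of the two splits used above, with inner_dim shrunk from 3072 to 8:

import torch

inner_dim, mlp_ratio = 8, 4.0

# Double blocks: one fused qkv matrix -> three equal chunks along dim 0.
qkv = torch.randn(3 * inner_dim, inner_dim)
q, k, v = torch.chunk(qkv, 3, dim=0)
assert q.shape == k.shape == v.shape == (inner_dim, inner_dim)

# Single blocks: q, k, v and the MLP up-projection share one linear1 weight,
# so the split sizes are uneven and torch.split is used instead of chunk.
mlp_hidden_dim = int(inner_dim * mlp_ratio)
split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim)
linear1 = torch.randn(sum(split_size), inner_dim)
q, k, v, mlp = torch.split(linear1, split_size, dim=0)
assert mlp.shape == (mlp_hidden_dim, inner_dim)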
diffusers/loaders/textual_inversion.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -427,7 +427,8 @@ class TextualInversionLoaderMixin:
                 logger.info(
                     "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
                 )
-                remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+                if is_sequential_cpu_offload or is_model_cpu_offload:
+                    remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
 
         # 7.2 save expected device and dtype
         device = text_encoder.device
diffusers/loaders/transformer_flux.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diffusers/loaders/transformer_sd3.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -123,7 +123,7 @@ class SD3Transformer2DLoadersMixin:
             key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj")
             updated_state_dict[key] = value
 
-        # Image projetion parameters
+        # Image projection parameters
         embed_dim = updated_state_dict["proj_in.weight"].shape[1]
         output_dim = updated_state_dict["proj_out.weight"].shape[0]
         hidden_dim = updated_state_dict["proj_in.weight"].shape[0]
diffusers/loaders/unet.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -155,10 +155,7 @@ class UNet2DConditionLoadersMixin:
         use_safetensors = True
         allow_pickle = True
 
-        user_agent = {
-            "file_type": "attn_procs_weights",
-            "framework": "pytorch",
-        }
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
 
         model_file = None
         if not isinstance(pretrained_model_name_or_path_or_dict, dict):
@@ -397,17 +394,6 @@ class UNet2DConditionLoadersMixin:
     @classmethod
     # Copied from diffusers.loaders.lora_base.LoraBaseMixin._optionally_disable_offloading
     def _optionally_disable_offloading(cls, _pipeline):
-        """
-        Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU.
-
-        Args:
-            _pipeline (`DiffusionPipeline`):
-                The pipeline to disable offloading for.
-
-        Returns:
-            tuple:
-                A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
-        """
         return _func_optionally_disable_offloading(_pipeline=_pipeline)
 
     def save_attn_procs(
diffusers/loaders/unet_loader_utils.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diffusers/loaders/utils.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diffusers/models/__init__.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@ if is_torch_available():
     _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
     _import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"]
     _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
+    _import_structure["autoencoders.autoencoder_kl_cosmos"] = ["AutoencoderKLCosmos"]
     _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = ["AutoencoderKLHunyuanVideo"]
     _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
     _import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"]

@@ -49,6 +50,7 @@ if is_torch_available():
         "HunyuanDiT2DControlNetModel",
         "HunyuanDiT2DMultiControlNetModel",
     ]
+    _import_structure["controlnets.controlnet_sana"] = ["SanaControlNetModel"]
     _import_structure["controlnets.controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
     _import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"]
     _import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"]

@@ -72,11 +74,15 @@ if is_torch_available():
     _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
     _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
     _import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"]
+    _import_structure["transformers.transformer_chroma"] = ["ChromaTransformer2DModel"]
     _import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"]
     _import_structure["transformers.transformer_cogview4"] = ["CogView4Transformer2DModel"]
+    _import_structure["transformers.transformer_cosmos"] = ["CosmosTransformer3DModel"]
     _import_structure["transformers.transformer_easyanimate"] = ["EasyAnimateTransformer3DModel"]
     _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
+    _import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
     _import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
+    _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
     _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
     _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
     _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]

@@ -84,6 +90,7 @@ if is_torch_available():
     _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
     _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
     _import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
+    _import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
     _import_structure["unets.unet_1d"] = ["UNet1DModel"]
     _import_structure["unets.unet_2d"] = ["UNet2DModel"]
     _import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"]
@@ -111,6 +118,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
111
118
  AutoencoderKL,
112
119
  AutoencoderKLAllegro,
113
120
  AutoencoderKLCogVideoX,
121
+ AutoencoderKLCosmos,
114
122
  AutoencoderKLHunyuanVideo,
115
123
  AutoencoderKLLTXVideo,
116
124
  AutoencoderKLMagvit,
@@ -133,6 +141,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
133
141
  HunyuanDiT2DMultiControlNetModel,
134
142
  MultiControlNetModel,
135
143
  MultiControlNetUnionModel,
144
+ SanaControlNetModel,
136
145
  SD3ControlNetModel,
137
146
  SD3MultiControlNetModel,
138
147
  SparseControlNetModel,
@@ -143,15 +152,19 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
143
152
  from .transformers import (
144
153
  AllegroTransformer3DModel,
145
154
  AuraFlowTransformer2DModel,
155
+ ChromaTransformer2DModel,
146
156
  CogVideoXTransformer3DModel,
147
157
  CogView3PlusTransformer2DModel,
148
158
  CogView4Transformer2DModel,
149
159
  ConsisIDTransformer3DModel,
160
+ CosmosTransformer3DModel,
150
161
  DiTTransformer2DModel,
151
162
  DualTransformer2DModel,
152
163
  EasyAnimateTransformer3DModel,
153
164
  FluxTransformer2DModel,
165
+ HiDreamImageTransformer2DModel,
154
166
  HunyuanDiT2DModel,
167
+ HunyuanVideoFramepackTransformer3DModel,
155
168
  HunyuanVideoTransformer3DModel,
156
169
  LatteTransformer3DModel,
157
170
  LTXVideoTransformer3DModel,
@@ -168,6 +181,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
168
181
  Transformer2DModel,
169
182
  TransformerTemporalModel,
170
183
  WanTransformer3DModel,
184
+ WanVACETransformer3DModel,
171
185
  )
172
186
  from .unets import (
173
187
  I2VGenXLUNet,
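For orientation, these `_import_structure` entries drive diffusers' lazy-import machinery: a registered class is only materialized when its attribute is first accessed. A minimal sketch of what the new 0.34.0 registrations expose, assuming the wheel is installed and the classes are re-exported at the top level, as is the library's convention:

```python
# Minimal sketch, assuming diffusers 0.34.0 is installed. Attribute access
# resolves lazily through _import_structure, so the underlying torch modules
# are only imported on first use.
import diffusers

print(diffusers.AutoencoderKLCosmos)                      # new autoencoder
print(diffusers.SanaControlNetModel)                      # new controlnet
print(diffusers.HunyuanVideoFramepackTransformer3DModel)  # new transformer
```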
@@ -1,5 +1,5 @@
  # coding=utf-8
- # Copyright 2024 HuggingFace Inc.
+ # Copyright 2025 HuggingFace Inc.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -92,7 +92,7 @@ class GELU(nn.Module):

  class GEGLU(nn.Module):
  r"""
- A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function.
+ A [variant](https://huggingface.co/papers/2002.05202) of the gated linear unit activation function.

  Parameters:
  dim_in (`int`): The number of channels in the input.
@@ -125,8 +125,8 @@ class GEGLU(nn.Module):

  class SwiGLU(nn.Module):
  r"""
- A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU`
- but uses SiLU / Swish instead of GeLU.
+ A [variant](https://huggingface.co/papers/2002.05202) of the gated linear unit activation function. It's similar to
+ `GEGLU` but uses SiLU / Swish instead of GeLU.

  Parameters:
  dim_in (`int`): The number of channels in the input.
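For reference, a minimal sketch of the two gated activations these docstrings describe (not the diffusers classes themselves, which also handle dtype casting and optional bias): one projection yields both the hidden states and a gate, and SwiGLU swaps the GELU gate for SiLU / Swish.

```python
import torch.nn as nn
import torch.nn.functional as F

class GEGLUSketch(nn.Module):
    """Gated linear unit variant from the referenced paper (2002.05202)."""
    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        hidden, gate = self.proj(x).chunk(2, dim=-1)
        return hidden * F.gelu(gate)  # GELU-gated

class SwiGLUSketch(GEGLUSketch):
    """Same gating, but SiLU / Swish replaces GELU on the gate."""
    def forward(self, x):
        hidden, gate = self.proj(x).chunk(2, dim=-1)
        return hidden * F.silu(gate)  # Swish-gated
```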
@@ -149,7 +149,7 @@ class SwiGLU(nn.Module):
  class ApproximateGELU(nn.Module):
  r"""
  The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this
- [paper](https://arxiv.org/abs/1606.08415).
+ [paper](https://huggingface.co/papers/1606.08415).

  Parameters:
  dim_in (`int`): The number of channels in the input.
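The sigmoid approximation from section 2 of that paper is compact enough to state inline; a sketch (the diffusers class additionally applies a Linear projection first):

```python
import torch

def approximate_gelu(x: torch.Tensor) -> torch.Tensor:
    # Section 2 of 1606.08415: GELU(x) ≈ x * sigmoid(1.702 * x).
    return x * torch.sigmoid(1.702 * x)
```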
@@ -161,9 +161,8 @@ class MultiAdapter(ModelMixin):
  pretrained_model_path (`os.PathLike`):
  A path to a *directory* containing model weights saved using
  [`~diffusers.models.adapter.MultiAdapter.save_pretrained`], e.g., `./my_model_directory/adapter`.
- torch_dtype (`str` or `torch.dtype`, *optional*):
- Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
- will be automatically derived from the model's weights.
+ torch_dtype (`torch.dtype`, *optional*):
+ Override the default `torch.dtype` and load the model under this dtype.
  output_loading_info(`bool`, *optional*, defaults to `False`):
  Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
  device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
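Note the practical implication of this docstring change: `"auto"` is no longer documented for `torch_dtype` here, so pass a concrete dtype. A hedged usage sketch (the directory path is the docstring's own illustrative example, not a real checkpoint):

```python
import torch
from diffusers import MultiAdapter

# Illustrative path only; it must contain weights saved via
# MultiAdapter.save_pretrained().
adapter = MultiAdapter.from_pretrained(
    "./my_model_directory/adapter",
    torch_dtype=torch.float16,  # explicit dtype, not "auto"
)
```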
@@ -1,4 +1,4 @@
- # Copyright 2024 The HuggingFace Team. All rights reserved.
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -90,7 +90,7 @@ class JointTransformerBlock(nn.Module):
  r"""
  A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

- Reference: https://arxiv.org/abs/2403.03206
+ Reference: https://huggingface.co/papers/2403.03206

  Parameters:
  dim (`int`): The number of channels in the input and output.
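For readers unfamiliar with the reference, a single-head illustrative sketch of the MMDiT joint-attention idea (not the diffusers block, which adds per-stream adaLN modulation and feed-forwards): each modality keeps its own QKV projection, but attention runs over the concatenated image+text sequence so the streams mix.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class JointAttentionSketch(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.img_qkv = nn.Linear(dim, dim * 3)  # image-stream projection
        self.txt_qkv = nn.Linear(dim, dim * 3)  # text-stream projection

    def forward(self, img: torch.Tensor, txt: torch.Tensor):
        q_i, k_i, v_i = self.img_qkv(img).chunk(3, dim=-1)
        q_t, k_t, v_t = self.txt_qkv(txt).chunk(3, dim=-1)
        # Attend over the concatenated sequence so the two streams interact.
        q = torch.cat([q_i, q_t], dim=1)
        k = torch.cat([k_i, k_t], dim=1)
        v = torch.cat([v_i, v_t], dim=1)
        out = F.scaled_dot_product_attention(q, k, v)
        return out[:, : img.shape[1]], out[:, img.shape[1] :]
```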
@@ -892,8 +892,8 @@ class FreeNoiseTransformerBlock(nn.Module):
  The number of frames to be skipped before starting to process a new batch of `context_length` frames.
  weighting_scheme (`str`, defaults to `"pyramid"`):
  The weighting scheme to use for weighting averaging of processed latent frames. As described in the
- Equation 9. of the [FreeNoise](https://arxiv.org/abs/2310.15169) paper, "pyramid" is the default setting
- used.
+ Equation 9. of the [FreeNoise](https://huggingface.co/papers/2310.15169) paper, "pyramid" is the default
+ setting used.
  """

  def __init__(
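The "pyramid" scheme is easy to picture: when overlapping frame windows are averaged, frames near a window's center get the largest weight. A sketch of the weight construction (an assumption drawn from Eq. 9 of the paper, not a copy of the diffusers helper):

```python
def pyramid_weights(context_length: int) -> list:
    # E.g. 5 -> [1, 2, 3, 2, 1]; 4 -> [1, 2, 2, 1]. Center frames dominate
    # when overlapping windows of processed latents are averaged.
    rising = list(range(1, context_length // 2 + 1))
    if context_length % 2:
        return rising + [context_length // 2 + 1] + rising[::-1]
    return rising + rising[::-1]
```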
@@ -1,4 +1,4 @@
- # Copyright 2024 The HuggingFace Team. All rights reserved.
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -75,7 +75,7 @@ def jax_memory_efficient_attention(
  query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
  ):
  r"""
- Flax Memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
+ Flax Memory-efficient multi-head dot product attention. https://huggingface.co/papers/2112.05682v2
  https://github.com/AminRezaei0x443/memory-efficient-attention

  Args:
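The idea behind the referenced paper is chunked softmax: keys and values are processed in blocks, and partial results are combined with a running max, so memory scales with the chunk size rather than the full sequence. A single-head sketch (the library function additionally chunks queries and handles batched multi-head inputs):

```python
import jax.numpy as jnp

def chunked_attention(q, k, v, key_chunk_size: int = 4096):
    # q: (Tq, d); k, v: (Tk, d). Softmax over keys, computed chunk by chunk.
    scale = q.shape[-1] ** -0.5
    parts = []
    for start in range(0, k.shape[0], key_chunk_size):
        k_c = k[start:start + key_chunk_size]
        v_c = v[start:start + key_chunk_size]
        s = (q @ k_c.T) * scale
        m = s.max(axis=-1, keepdims=True)       # per-chunk max for stability
        p = jnp.exp(s - m)
        parts.append((m, p @ v_c, p.sum(axis=-1, keepdims=True)))
    # Rescale each chunk's numerator/denominator by the global max.
    m_all = jnp.concatenate([m for m, _, _ in parts], axis=-1).max(axis=-1, keepdims=True)
    num = sum(jnp.exp(m - m_all) * o for m, o, _ in parts)
    den = sum(jnp.exp(m - m_all) * z for m, _, z in parts)
    return num / den
```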
@@ -121,7 +121,7 @@ def jax_memory_efficient_attention(

  class FlaxAttention(nn.Module):
  r"""
- A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762
+ A Flax multi-head attention module as described in: https://huggingface.co/papers/1706.03762

  Parameters:
  query_dim (:obj:`int`):
@@ -133,7 +133,7 @@ class FlaxAttention(nn.Module):
  dropout (:obj:`float`, *optional*, defaults to 0.0):
  Dropout rate
  use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
- enable memory efficient attention https://arxiv.org/abs/2112.05682
+ enable memory efficient attention https://huggingface.co/papers/2112.05682
  split_head_dim (`bool`, *optional*, defaults to `False`):
  Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
  enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
@@ -244,7 +244,7 @@ class FlaxAttention(nn.Module):
  class FlaxBasicTransformerBlock(nn.Module):
  r"""
  A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
- https://arxiv.org/abs/1706.03762
+ https://huggingface.co/papers/1706.03762


  Parameters:
@@ -261,7 +261,7 @@ class FlaxBasicTransformerBlock(nn.Module):
  dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
  Parameters `dtype`
  use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
- enable memory efficient attention https://arxiv.org/abs/2112.05682
+ enable memory efficient attention https://huggingface.co/papers/2112.05682
  split_head_dim (`bool`, *optional*, defaults to `False`):
  Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
  enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
@@ -328,7 +328,7 @@ class FlaxBasicTransformerBlock(nn.Module):
  class FlaxTransformer2DModel(nn.Module):
  r"""
  A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
- https://arxiv.org/pdf/1506.02025.pdf
+ https://huggingface.co/papers/1506.02025


  Parameters:
@@ -347,7 +347,7 @@ class FlaxTransformer2DModel(nn.Module):
  dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
  Parameters `dtype`
  use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
- enable memory efficient attention https://arxiv.org/abs/2112.05682
+ enable memory efficient attention https://huggingface.co/papers/2112.05682
  split_head_dim (`bool`, *optional*, defaults to `False`):
  Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
  enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
@@ -436,7 +436,7 @@ class FlaxFeedForward(nn.Module):
  Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
  [`FeedForward`] class, with the following simplifications:
  - The activation function is currently hardcoded to a gated linear unit from:
- https://arxiv.org/abs/2002.05202
+ https://huggingface.co/papers/2002.05202
  - `dim_out` is equal to `dim`.
  - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].
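Taken together, the simplifications listed above make the module short enough to sketch in a few lines (a functional equivalent under those stated assumptions, not the library class itself):

```python
import flax.linen as nn
import jax.numpy as jnp

class FeedForwardSketch(nn.Module):
    dim: int

    @nn.compact
    def __call__(self, x):
        # GEGLU projection: hidden states and gate from one Dense, 4x width.
        h, gate = jnp.split(nn.Dense(self.dim * 4 * 2)(x), 2, axis=-1)
        return nn.Dense(self.dim)(h * nn.gelu(gate))
```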

@@ -468,7 +468,7 @@ class FlaxFeedForward(nn.Module):
  class FlaxGEGLU(nn.Module):
  r"""
  Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
- https://arxiv.org/abs/2002.05202.
+ https://huggingface.co/papers/2002.05202.

  Parameters:
  dim (:obj:`int`):