diffusers 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (478)
  1. diffusers/__init__.py +48 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/dependency_versions_check.py +1 -1
  7. diffusers/dependency_versions_table.py +1 -1
  8. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  9. diffusers/hooks/faster_cache.py +2 -2
  10. diffusers/hooks/group_offloading.py +128 -29
  11. diffusers/hooks/hooks.py +2 -2
  12. diffusers/hooks/layerwise_casting.py +3 -3
  13. diffusers/hooks/pyramid_attention_broadcast.py +1 -1
  14. diffusers/image_processor.py +7 -2
  15. diffusers/loaders/__init__.py +4 -0
  16. diffusers/loaders/ip_adapter.py +5 -14
  17. diffusers/loaders/lora_base.py +212 -111
  18. diffusers/loaders/lora_conversion_utils.py +275 -34
  19. diffusers/loaders/lora_pipeline.py +1554 -819
  20. diffusers/loaders/peft.py +52 -109
  21. diffusers/loaders/single_file.py +2 -2
  22. diffusers/loaders/single_file_model.py +20 -4
  23. diffusers/loaders/single_file_utils.py +225 -5
  24. diffusers/loaders/textual_inversion.py +3 -2
  25. diffusers/loaders/transformer_flux.py +1 -1
  26. diffusers/loaders/transformer_sd3.py +2 -2
  27. diffusers/loaders/unet.py +2 -16
  28. diffusers/loaders/unet_loader_utils.py +1 -1
  29. diffusers/loaders/utils.py +1 -1
  30. diffusers/models/__init__.py +15 -1
  31. diffusers/models/activations.py +5 -5
  32. diffusers/models/adapter.py +2 -3
  33. diffusers/models/attention.py +4 -4
  34. diffusers/models/attention_flax.py +10 -10
  35. diffusers/models/attention_processor.py +14 -10
  36. diffusers/models/auto_model.py +47 -10
  37. diffusers/models/autoencoders/__init__.py +1 -0
  38. diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
  39. diffusers/models/autoencoders/autoencoder_dc.py +3 -3
  40. diffusers/models/autoencoders/autoencoder_kl.py +4 -4
  41. diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
  42. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
  43. diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
  44. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
  45. diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
  46. diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
  47. diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
  48. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
  49. diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
  50. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
  51. diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
  52. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  53. diffusers/models/autoencoders/vae.py +13 -2
  54. diffusers/models/autoencoders/vq_model.py +2 -2
  55. diffusers/models/cache_utils.py +1 -1
  56. diffusers/models/controlnet.py +1 -1
  57. diffusers/models/controlnet_flux.py +1 -1
  58. diffusers/models/controlnet_sd3.py +1 -1
  59. diffusers/models/controlnet_sparsectrl.py +1 -1
  60. diffusers/models/controlnets/__init__.py +1 -0
  61. diffusers/models/controlnets/controlnet.py +3 -3
  62. diffusers/models/controlnets/controlnet_flax.py +1 -1
  63. diffusers/models/controlnets/controlnet_flux.py +16 -15
  64. diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
  65. diffusers/models/controlnets/controlnet_sana.py +290 -0
  66. diffusers/models/controlnets/controlnet_sd3.py +1 -1
  67. diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
  68. diffusers/models/controlnets/controlnet_union.py +1 -1
  69. diffusers/models/controlnets/controlnet_xs.py +7 -7
  70. diffusers/models/controlnets/multicontrolnet.py +4 -5
  71. diffusers/models/controlnets/multicontrolnet_union.py +5 -6
  72. diffusers/models/downsampling.py +2 -2
  73. diffusers/models/embeddings.py +10 -12
  74. diffusers/models/embeddings_flax.py +2 -2
  75. diffusers/models/lora.py +3 -3
  76. diffusers/models/modeling_utils.py +44 -14
  77. diffusers/models/normalization.py +4 -4
  78. diffusers/models/resnet.py +2 -2
  79. diffusers/models/resnet_flax.py +1 -1
  80. diffusers/models/transformers/__init__.py +5 -0
  81. diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
  82. diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
  83. diffusers/models/transformers/consisid_transformer_3d.py +1 -1
  84. diffusers/models/transformers/dit_transformer_2d.py +2 -2
  85. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  86. diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
  87. diffusers/models/transformers/latte_transformer_3d.py +4 -5
  88. diffusers/models/transformers/lumina_nextdit2d.py +2 -2
  89. diffusers/models/transformers/pixart_transformer_2d.py +3 -3
  90. diffusers/models/transformers/prior_transformer.py +1 -1
  91. diffusers/models/transformers/sana_transformer.py +8 -3
  92. diffusers/models/transformers/stable_audio_transformer.py +5 -9
  93. diffusers/models/transformers/t5_film_transformer.py +3 -3
  94. diffusers/models/transformers/transformer_2d.py +1 -1
  95. diffusers/models/transformers/transformer_allegro.py +1 -1
  96. diffusers/models/transformers/transformer_chroma.py +742 -0
  97. diffusers/models/transformers/transformer_cogview3plus.py +5 -10
  98. diffusers/models/transformers/transformer_cogview4.py +317 -25
  99. diffusers/models/transformers/transformer_cosmos.py +579 -0
  100. diffusers/models/transformers/transformer_flux.py +9 -11
  101. diffusers/models/transformers/transformer_hidream_image.py +942 -0
  102. diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
  103. diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
  104. diffusers/models/transformers/transformer_ltx.py +2 -2
  105. diffusers/models/transformers/transformer_lumina2.py +1 -1
  106. diffusers/models/transformers/transformer_mochi.py +1 -1
  107. diffusers/models/transformers/transformer_omnigen.py +2 -2
  108. diffusers/models/transformers/transformer_sd3.py +7 -7
  109. diffusers/models/transformers/transformer_temporal.py +1 -1
  110. diffusers/models/transformers/transformer_wan.py +24 -8
  111. diffusers/models/transformers/transformer_wan_vace.py +393 -0
  112. diffusers/models/unets/unet_1d.py +1 -1
  113. diffusers/models/unets/unet_1d_blocks.py +1 -1
  114. diffusers/models/unets/unet_2d.py +1 -1
  115. diffusers/models/unets/unet_2d_blocks.py +1 -1
  116. diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
  117. diffusers/models/unets/unet_2d_condition.py +2 -2
  118. diffusers/models/unets/unet_2d_condition_flax.py +2 -2
  119. diffusers/models/unets/unet_3d_blocks.py +1 -1
  120. diffusers/models/unets/unet_3d_condition.py +3 -3
  121. diffusers/models/unets/unet_i2vgen_xl.py +3 -3
  122. diffusers/models/unets/unet_kandinsky3.py +1 -1
  123. diffusers/models/unets/unet_motion_model.py +2 -2
  124. diffusers/models/unets/unet_stable_cascade.py +1 -1
  125. diffusers/models/upsampling.py +2 -2
  126. diffusers/models/vae_flax.py +2 -2
  127. diffusers/models/vq_model.py +1 -1
  128. diffusers/pipelines/__init__.py +37 -6
  129. diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
  130. diffusers/pipelines/amused/pipeline_amused.py +7 -6
  131. diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
  132. diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
  133. diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
  134. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
  135. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
  136. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
  137. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
  138. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
  139. diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
  140. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  141. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
  142. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
  143. diffusers/pipelines/auto_pipeline.py +6 -7
  144. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  145. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  146. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
  147. diffusers/pipelines/chroma/__init__.py +49 -0
  148. diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
  149. diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
  150. diffusers/pipelines/chroma/pipeline_output.py +21 -0
  151. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
  152. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
  153. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
  154. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
  155. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
  156. diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
  157. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
  158. diffusers/pipelines/consisid/consisid_utils.py +2 -2
  159. diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
  160. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  161. diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
  162. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
  163. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
  164. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
  165. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
  166. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
  167. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
  168. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
  169. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
  170. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
  171. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  172. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
  173. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
  174. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  175. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
  176. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
  177. diffusers/pipelines/cosmos/__init__.py +54 -0
  178. diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
  179. diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
  180. diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
  181. diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
  182. diffusers/pipelines/cosmos/pipeline_output.py +40 -0
  183. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
  184. diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
  185. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  186. diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
  187. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
  188. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
  189. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
  190. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
  191. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
  192. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
  193. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
  194. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  195. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
  196. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  197. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
  198. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
  199. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  200. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  201. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  202. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  203. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  204. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
  205. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
  206. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
  207. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
  208. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
  209. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
  210. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  211. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
  212. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
  213. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
  214. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
  215. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
  216. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  217. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  218. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
  219. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
  220. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
  221. diffusers/pipelines/flux/modeling_flux.py +1 -1
  222. diffusers/pipelines/flux/pipeline_flux.py +10 -17
  223. diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
  224. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
  225. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
  226. diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
  227. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
  228. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
  229. diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
  230. diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
  231. diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
  232. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
  233. diffusers/pipelines/free_init_utils.py +2 -2
  234. diffusers/pipelines/free_noise_utils.py +3 -3
  235. diffusers/pipelines/hidream_image/__init__.py +47 -0
  236. diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
  237. diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
  238. diffusers/pipelines/hunyuan_video/__init__.py +2 -0
  239. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
  240. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
  241. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
  242. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
  243. diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
  244. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
  245. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
  246. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
  247. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
  248. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
  249. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
  250. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
  251. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  252. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
  253. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
  254. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
  255. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
  256. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
  257. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
  258. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
  259. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
  260. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
  261. diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
  262. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
  263. diffusers/pipelines/kolors/text_encoder.py +3 -3
  264. diffusers/pipelines/kolors/tokenizer.py +1 -1
  265. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
  266. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
  267. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  268. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
  269. diffusers/pipelines/latte/pipeline_latte.py +12 -12
  270. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
  271. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
  272. diffusers/pipelines/ltx/__init__.py +4 -0
  273. diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
  274. diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
  275. diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
  276. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
  277. diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
  278. diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
  279. diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
  280. diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
  281. diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
  282. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
  283. diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
  284. diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
  285. diffusers/pipelines/onnx_utils.py +15 -2
  286. diffusers/pipelines/pag/pag_utils.py +2 -2
  287. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
  288. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
  289. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
  290. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
  291. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
  292. diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
  293. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
  294. diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
  295. diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
  296. diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
  297. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
  298. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
  299. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
  300. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
  301. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
  302. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
  303. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
  304. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  305. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
  306. diffusers/pipelines/pia/pipeline_pia.py +8 -6
  307. diffusers/pipelines/pipeline_flax_utils.py +3 -4
  308. diffusers/pipelines/pipeline_loading_utils.py +89 -13
  309. diffusers/pipelines/pipeline_utils.py +105 -33
  310. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
  311. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
  312. diffusers/pipelines/sana/__init__.py +4 -0
  313. diffusers/pipelines/sana/pipeline_sana.py +23 -21
  314. diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
  315. diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
  316. diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
  317. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
  318. diffusers/pipelines/shap_e/camera.py +1 -1
  319. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  320. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  321. diffusers/pipelines/shap_e/renderer.py +3 -3
  322. diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
  323. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
  324. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
  325. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
  326. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
  327. diffusers/pipelines/stable_diffusion/__init__.py +0 -7
  328. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  329. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
  330. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  331. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
  332. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  333. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
  334. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
  335. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
  336. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
  337. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
  338. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
  339. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
  340. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
  341. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
  342. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
  343. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
  344. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
  345. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
  346. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
  347. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  348. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  349. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  350. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
  351. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
  352. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
  353. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
  354. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
  355. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
  356. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
  357. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
  358. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
  359. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
  360. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
  361. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
  362. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  363. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
  364. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  365. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
  366. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
  367. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
  368. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
  369. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
  370. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
  371. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
  372. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
  373. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
  374. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
  375. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
  376. diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
  377. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
  378. diffusers/pipelines/unclip/text_proj.py +2 -2
  379. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
  380. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  381. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
  382. diffusers/pipelines/visualcloze/__init__.py +52 -0
  383. diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
  384. diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
  385. diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
  386. diffusers/pipelines/wan/__init__.py +2 -0
  387. diffusers/pipelines/wan/pipeline_wan.py +13 -10
  388. diffusers/pipelines/wan/pipeline_wan_i2v.py +38 -18
  389. diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
  390. diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
  391. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  392. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
  393. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  394. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  395. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
  396. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
  397. diffusers/quantizers/__init__.py +179 -1
  398. diffusers/quantizers/base.py +6 -1
  399. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
  400. diffusers/quantizers/bitsandbytes/utils.py +10 -7
  401. diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
  402. diffusers/quantizers/gguf/utils.py +16 -13
  403. diffusers/quantizers/quantization_config.py +18 -16
  404. diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
  405. diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
  406. diffusers/schedulers/__init__.py +3 -1
  407. diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
  408. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  409. diffusers/schedulers/scheduling_consistency_models.py +1 -1
  410. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
  411. diffusers/schedulers/scheduling_ddim.py +8 -8
  412. diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
  413. diffusers/schedulers/scheduling_ddim_flax.py +6 -6
  414. diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
  415. diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
  416. diffusers/schedulers/scheduling_ddpm.py +9 -9
  417. diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
  418. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
  419. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
  420. diffusers/schedulers/scheduling_deis_multistep.py +8 -8
  421. diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
  422. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
  423. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
  424. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
  425. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  426. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
  427. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
  428. diffusers/schedulers/scheduling_edm_euler.py +20 -11
  429. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
  430. diffusers/schedulers/scheduling_euler_discrete.py +3 -3
  431. diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
  432. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
  433. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
  434. diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
  435. diffusers/schedulers/scheduling_heun_discrete.py +2 -2
  436. diffusers/schedulers/scheduling_ipndm.py +2 -2
  437. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
  438. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
  439. diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
  440. diffusers/schedulers/scheduling_lcm.py +3 -3
  441. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  442. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  443. diffusers/schedulers/scheduling_pndm.py +4 -4
  444. diffusers/schedulers/scheduling_pndm_flax.py +4 -4
  445. diffusers/schedulers/scheduling_repaint.py +9 -9
  446. diffusers/schedulers/scheduling_sasolver.py +15 -15
  447. diffusers/schedulers/scheduling_scm.py +1 -1
  448. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  449. diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
  450. diffusers/schedulers/scheduling_tcd.py +3 -3
  451. diffusers/schedulers/scheduling_unclip.py +5 -5
  452. diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
  453. diffusers/schedulers/scheduling_utils.py +1 -1
  454. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  455. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  456. diffusers/training_utils.py +13 -5
  457. diffusers/utils/__init__.py +5 -0
  458. diffusers/utils/accelerate_utils.py +1 -1
  459. diffusers/utils/doc_utils.py +1 -1
  460. diffusers/utils/dummy_pt_objects.py +120 -0
  461. diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
  462. diffusers/utils/dynamic_modules_utils.py +21 -3
  463. diffusers/utils/export_utils.py +1 -1
  464. diffusers/utils/import_utils.py +81 -18
  465. diffusers/utils/logging.py +1 -1
  466. diffusers/utils/outputs.py +2 -1
  467. diffusers/utils/peft_utils.py +91 -8
  468. diffusers/utils/state_dict_utils.py +20 -3
  469. diffusers/utils/testing_utils.py +59 -7
  470. diffusers/utils/torch_utils.py +25 -5
  471. diffusers/video_processor.py +2 -2
  472. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/METADATA +70 -55
  473. diffusers-0.34.0.dist-info/RECORD +639 -0
  474. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/WHEEL +1 -1
  475. diffusers-0.33.1.dist-info/RECORD +0 -608
  476. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
  477. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
  478. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/ltx/pipeline_ltx_image2video.py
@@ -1,4 +1,4 @@
- # Copyright 2024 Lightricks and The HuggingFace Team. All rights reserved.
+ # Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -159,6 +159,33 @@ def retrieve_latents(
  raise AttributeError("Could not access latents of provided encoder_output")


+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+ r"""
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
+ Flawed](https://huggingface.co/papers/2305.08891).
+
+ Args:
+ noise_cfg (`torch.Tensor`):
+ The predicted noise tensor for the guided diffusion process.
+ noise_pred_text (`torch.Tensor`):
+ The predicted noise tensor for the text-guided diffusion process.
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
+ A rescale factor applied to the noise predictions.
+
+ Returns:
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
+ """
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+ # rescale the results from guidance (fixes overexposure)
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+ return noise_cfg
+
+
  class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
  r"""
  Pipeline for image-to-video generation.
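The helper added here matches the combined CFG prediction's per-sample standard deviation to that of the text branch, then blends by `guidance_rescale`. A minimal sketch of that property, not part of the diff, assuming `rescale_noise_cfg` is imported from the Stable Diffusion module named in the `Copied from` line above (with `guidance_rescale=1.0`, the output's per-sample std equals that of `noise_pred_text`):

import torch
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg

noise_pred_text = torch.randn(2, 128, 32, 64)
noise_pred_uncond = torch.randn(2, 128, 32, 64)
# standard classifier-free guidance combination, as in the pipeline below
noise_cfg = noise_pred_uncond + 5.0 * (noise_pred_text - noise_pred_uncond)

rescaled = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0)
std_text = noise_pred_text.std(dim=[1, 2, 3], keepdim=True)
std_out = rescaled.std(dim=[1, 2, 3], keepdim=True)
assert torch.allclose(std_out, std_text, rtol=1e-4)  # fully rescaled std matches the text branch

With `guidance_rescale=0.0` the function returns `noise_cfg` unchanged, which is why the pipeline only calls it when `self.guidance_rescale > 0`.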
@@ -542,6 +569,10 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
  def guidance_scale(self):
  return self._guidance_scale

+ @property
+ def guidance_rescale(self):
+ return self._guidance_rescale
+
  @property
  def do_classifier_free_guidance(self):
  return self._guidance_scale > 1.0
@@ -576,6 +607,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
  num_inference_steps: int = 50,
  timesteps: List[int] = None,
  guidance_scale: float = 3,
+ guidance_rescale: float = 0.0,
  num_videos_per_prompt: Optional[int] = 1,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
  latents: Optional[torch.Tensor] = None,
@@ -615,11 +647,16 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
  in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
  passed will be used. Must be in descending order.
  guidance_scale (`float`, defaults to `3 `):
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
- usually at the expense of lower image quality.
+ Guidance scale as defined in [Classifier-Free Diffusion
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+ the text `prompt`, usually at the expense of lower image quality.
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
  num_videos_per_prompt (`int`, *optional*, defaults to 1):
  The number of videos to generate per prompt.
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -688,6 +725,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
  )

  self._guidance_scale = guidance_scale
+ self._guidance_rescale = guidance_rescale
  self._attention_kwargs = attention_kwargs
  self._interrupt = False
  self._current_timestep = None
@@ -811,6 +849,12 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
  noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
  timestep, _ = timestep.chunk(2)

+ if self.guidance_rescale > 0:
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+ noise_pred = rescale_noise_cfg(
+ noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+ )
+
  # compute the previous noisy sample x_t -> x_t-1
  noise_pred = self._unpack_latents(
  noise_pred,
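Together these hunks expose the new knob end to end: `__call__` accepts `guidance_rescale`, stores it, and applies `rescale_noise_cfg` right after the CFG combination. A minimal usage sketch, not part of the diff, assuming the `Lightricks/LTX-Video` checkpoint and a hypothetical local `input.png`:

import torch
from diffusers import LTXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("input.png")  # hypothetical local frame to animate
video = pipe(
    image=image,
    prompt="a sailboat gliding through a calm harbor at sunset",
    guidance_scale=3.0,
    guidance_rescale=0.7,  # new in 0.34.0; 0.0 (the default) reproduces 0.33.x behavior
).frames[0]
export_to_video(video, "output.mp4", fps=24)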
diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py
@@ -0,0 +1,277 @@
+ # Copyright 2025 Lightricks and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import List, Optional, Union
+
+ import torch
+
+ from ...image_processor import PipelineImageInput
+ from ...models import AutoencoderKLLTXVideo
+ from ...utils import get_logger
+ from ...utils.torch_utils import randn_tensor
+ from ...video_processor import VideoProcessor
+ from ..pipeline_utils import DiffusionPipeline
+ from .modeling_latent_upsampler import LTXLatentUpsamplerModel
+ from .pipeline_output import LTXPipelineOutput
+
+
+ logger = get_logger(__name__) # pylint: disable=invalid-name
+
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+ def retrieve_latents(
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+ ):
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+ return encoder_output.latent_dist.sample(generator)
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+ return encoder_output.latent_dist.mode()
+ elif hasattr(encoder_output, "latents"):
+ return encoder_output.latents
+ else:
+ raise AttributeError("Could not access latents of provided encoder_output")
+
+
+ class LTXLatentUpsamplePipeline(DiffusionPipeline):
+ model_cpu_offload_seq = ""
+
+ def __init__(
+ self,
+ vae: AutoencoderKLLTXVideo,
+ latent_upsampler: LTXLatentUpsamplerModel,
+ ) -> None:
+ super().__init__()
+
+ self.register_modules(vae=vae, latent_upsampler=latent_upsampler)
+
+ self.vae_spatial_compression_ratio = (
+ self.vae.spatial_compression_ratio if getattr(self, "vae", None) is not None else 32
+ )
+ self.vae_temporal_compression_ratio = (
+ self.vae.temporal_compression_ratio if getattr(self, "vae", None) is not None else 8
+ )
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_spatial_compression_ratio)
+
+ def prepare_latents(
+ self,
+ video: Optional[torch.Tensor] = None,
+ batch_size: int = 1,
+ dtype: Optional[torch.dtype] = None,
+ device: Optional[torch.device] = None,
+ generator: Optional[torch.Generator] = None,
+ latents: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ if latents is not None:
+ return latents.to(device=device, dtype=dtype)
+
+ video = video.to(device=device, dtype=self.vae.dtype)
+ if isinstance(generator, list):
+ if len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ init_latents = [
+ retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
+ ]
+ else:
+ init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
+
+ init_latents = torch.cat(init_latents, dim=0).to(dtype)
+ init_latents = self._normalize_latents(init_latents, self.vae.latents_mean, self.vae.latents_std)
+ return init_latents
+
+ def adain_filter_latent(self, latents: torch.Tensor, reference_latents: torch.Tensor, factor: float = 1.0):
+ """
+ Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on statistics from a reference latent
+ tensor.
+
+ Args:
+ latent (`torch.Tensor`):
+ Input latents to normalize
+ reference_latents (`torch.Tensor`):
+ The reference latents providing style statistics.
+ factor (`float`):
+ Blending factor between original and transformed latent. Range: -10.0 to 10.0, Default: 1.0
+
+ Returns:
+ torch.Tensor: The transformed latent tensor
+ """
+ result = latents.clone()
+
+ for i in range(latents.size(0)):
+ for c in range(latents.size(1)):
+ r_sd, r_mean = torch.std_mean(reference_latents[i, c], dim=None) # index by original dim order
+ i_sd, i_mean = torch.std_mean(result[i, c], dim=None)
+
+ result[i, c] = ((result[i, c] - i_mean) / i_sd) * r_sd + r_mean
+
+ result = torch.lerp(latents, result, factor)
+ return result
+
+ @staticmethod
+ # Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents
+ def _normalize_latents(
+ latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
+ ) -> torch.Tensor:
+ # Normalize latents across the channel dimension [B, C, F, H, W]
+ latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+ latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+ latents = (latents - latents_mean) * scaling_factor / latents_std
+ return latents
+
+ @staticmethod
+ # Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._denormalize_latents
+ def _denormalize_latents(
+ latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
+ ) -> torch.Tensor:
+ # Denormalize latents across the channel dimension [B, C, F, H, W]
+ latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+ latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+ latents = latents * latents_std / scaling_factor + latents_mean
+ return latents
+
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+ processing larger images.
+ """
+ self.vae.enable_tiling()
+
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ def check_inputs(self, video, height, width, latents):
+ if height % self.vae_spatial_compression_ratio != 0 or width % self.vae_spatial_compression_ratio != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
+
+ if video is not None and latents is not None:
+ raise ValueError("Only one of `video` or `latents` can be provided.")
+ if video is None and latents is None:
+ raise ValueError("One of `video` or `latents` has to be provided.")
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ video: Optional[List[PipelineImageInput]] = None,
+ height: int = 512,
+ width: int = 704,
+ latents: Optional[torch.Tensor] = None,
+ decode_timestep: Union[float, List[float]] = 0.0,
+ decode_noise_scale: Optional[Union[float, List[float]]] = None,
+ adain_factor: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ):
+ self.check_inputs(
+ video=video,
+ height=height,
+ width=width,
+ latents=latents,
+ )
+
+ if video is not None:
+ # Batched video input is not yet tested/supported. TODO: take a look later
+ batch_size = 1
+ else:
+ batch_size = latents.shape[0]
+ device = self._execution_device
+
+ if video is not None:
+ num_frames = len(video)
+ if num_frames % self.vae_temporal_compression_ratio != 1:
+ num_frames = (
+ num_frames // self.vae_temporal_compression_ratio * self.vae_temporal_compression_ratio + 1
+ )
+ video = video[:num_frames]
+ logger.warning(
+ f"Video length expected to be of the form `k * {self.vae_temporal_compression_ratio} + 1` but is {len(video)}. Truncating to {num_frames} frames."
+ )
+ video = self.video_processor.preprocess_video(video, height=height, width=width)
+ video = video.to(device=device, dtype=torch.float32)
+
+ latents = self.prepare_latents(
+ video=video,
+ batch_size=batch_size,
+ dtype=torch.float32,
+ device=device,
+ generator=generator,
+ latents=latents,
+ )
+
+ latents = self._denormalize_latents(
+ latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
+ )
+ latents = latents.to(self.latent_upsampler.dtype)
+ latents_upsampled = self.latent_upsampler(latents)
+
+ if adain_factor > 0.0:
+ latents = self.adain_filter_latent(latents_upsampled, latents, adain_factor)
+ else:
+ latents = latents_upsampled
+
+ if output_type == "latent":
+ latents = self._normalize_latents(
+ latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
+ )
+ video = latents
+ else:
+ if not self.vae.config.timestep_conditioning:
+ timestep = None
+ else:
+ noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
+ if not isinstance(decode_timestep, list):
+ decode_timestep = [decode_timestep] * batch_size
+ if decode_noise_scale is None:
+ decode_noise_scale = decode_timestep
+ elif not isinstance(decode_noise_scale, list):
+ decode_noise_scale = [decode_noise_scale] * batch_size
+
+ timestep = torch.tensor(decode_timestep, device=device, dtype=latents.dtype)
+ decode_noise_scale = torch.tensor(decode_noise_scale, device=device, dtype=latents.dtype)[
+ :, None, None, None, None
+ ]
+ latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise
+
+ video = self.vae.decode(latents, timestep, return_dict=False)[0]
+ video = self.video_processor.postprocess_video(video, output_type=output_type)
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (video,)
+
+ return LTXPipelineOutput(frames=video)
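This new file is one half of the LTX latent-upsampling path added in 0.34.0 (the other half is `modeling_latent_upsampler.py` above in the file list): it encodes a decoded video back into VAE latents (or takes latents directly), runs them through `LTXLatentUpsamplerModel`, and optionally AdaIN-matches the upsampled latents to the originals via `adain_factor`. A minimal usage sketch, not part of the diff; the upsampler repo id `Lightricks/ltxv-spatial-upscaler-0.9.7` is an assumption and may differ:

import torch
from diffusers import LTXPipeline, LTXLatentUpsamplePipeline

base = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16).to("cuda")
upsampler = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7",  # assumed checkpoint id
    vae=base.vae,  # reuse the base pipeline's VAE, matching __init__ above
    torch_dtype=torch.bfloat16,
).to("cuda")

frames = base(prompt="a timelapse of clouds over mountains", width=704, height=480).frames[0]
# height/width must be divisible by the VAE spatial compression ratio (32); see check_inputs
upscaled = upsampler(video=frames, width=704, height=480, adain_factor=0.5).frames[0]

Setting `adain_factor=0.0` (the default) skips the AdaIN blend and returns the raw upsampler output, while `output_type="latent"` keeps the result in re-normalized latent space for a further refinement pass.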
diffusers/pipelines/lumina/pipeline_lumina.py
@@ -1,4 +1,4 @@
- # Copyright 2024 Alpha-VLLM and The HuggingFace Team. All rights reserved.
+ # Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -372,7 +372,7 @@ class LuminaPipeline(DiffusionPipeline):
  def prepare_extra_step_kwargs(self, generator, eta):
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
  # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
  # and should be between [0, 1]

  accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -534,7 +534,7 @@ class LuminaPipeline(DiffusionPipeline):
  # &amp
  caption = re.sub(r"&amp", "", caption)

- # ip adresses:
+ # ip addresses:
  caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

  # article ids:
@@ -619,7 +619,7 @@ class LuminaPipeline(DiffusionPipeline):
  return self._guidance_scale

  # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
  # corresponds to doing no classifier free guidance.
  @property
  def do_classifier_free_guidance(self):
@@ -677,11 +677,11 @@ class LuminaPipeline(DiffusionPipeline):
  their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
  will be used.
  guidance_scale (`float`, *optional*, defaults to 4.0):
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
- usually at the expense of lower image quality.
+ Guidance scale as defined in [Classifier-Free Diffusion
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+ the text `prompt`, usually at the expense of lower image quality.
  num_images_per_prompt (`int`, *optional*, defaults to 1):
  The number of images to generate per prompt.
  height (`int`, *optional*, defaults to self.unet.config.sample_size):
@@ -689,8 +689,8 @@ class LuminaPipeline(DiffusionPipeline):
  width (`int`, *optional*, defaults to self.unet.config.sample_size):
  The width in pixels of the generated image.
  eta (`float`, *optional*, defaults to 0.0):
- Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
- [`schedulers.DDIMScheduler`], will be ignored for others.
+ Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+ applies to [`schedulers.DDIMScheduler`], will be ignored for others.
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
  to make generation deterministic.
@@ -771,7 +771,7 @@ class LuminaPipeline(DiffusionPipeline):
  device = self._execution_device

  # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
  # corresponds to doing no classifier free guidance.
  do_classifier_free_guidance = guidance_scale > 1.0

@@ -848,7 +848,7 @@ class LuminaPipeline(DiffusionPipeline):
  # prepare image_rotary_emb for positional encoding
  # dynamic scaling_factor for different resolution.
  # NOTE: For `Time-aware` denosing mechanism from Lumina-Next
- # https://arxiv.org/abs/2406.18583, Sec 2.3
+ # https://huggingface.co/papers/2406.18583, Sec 2.3
  # NOTE: We should compute different image_rotary_emb with different timestep.
  if current_timestep[0] < scaling_watershed:
  linear_factor = scaling_factor
@@ -1,4 +1,4 @@
1
- # Copyright 2024 Alpha-VLLM and The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -342,7 +342,7 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
342
342
  def prepare_extra_step_kwargs(self, generator, eta):
343
343
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
344
344
  # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
345
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
345
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
346
346
  # and should be between [0, 1]
347
347
 
348
348
  accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -487,7 +487,7 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
487
487
  return self._attention_kwargs
488
488
 
489
489
  # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
490
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
490
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
491
491
  # corresponds to doing no classifier free guidance.
492
492
  @property
493
493
  def do_classifier_free_guidance(self):
@@ -544,11 +544,11 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
  their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
  will be used.
  guidance_scale (`float`, *optional*, defaults to 4.0):
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
- usually at the expense of lower image quality.
+ Guidance scale as defined in [Classifier-Free Diffusion
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+ the text `prompt`, usually at the expense of lower image quality.
  num_images_per_prompt (`int`, *optional*, defaults to 1):
  The number of images to generate per prompt.
  height (`int`, *optional*, defaults to self.unet.config.sample_size):
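For orientation, a usage sketch of the documented parameter (the checkpoint id is an assumption for illustration; substitute a real Lumina 2 checkpoint):

```python
import torch
from diffusers import Lumina2Pipeline

# Hypothetical checkpoint id, shown for illustration only.
pipe = Lumina2Pipeline.from_pretrained(
    "Alpha-VLLM/Lumina-Image-2.0", torch_dtype=torch.bfloat16
)
pipe.to("cuda")

# guidance_scale > 1 enables classifier-free guidance; higher values track
# the prompt more closely, usually at some cost in image quality.
image = pipe(prompt="a watercolor fox in a snowy forest", guidance_scale=4.0).images[0]
```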
@@ -556,8 +556,8 @@ class Lumina2Pipeline(DiffusionPipeline, Lumina2LoraLoaderMixin):
  width (`int`, *optional*, defaults to self.unet.config.sample_size):
  The width in pixels of the generated image.
  eta (`float`, *optional*, defaults to 0.0):
- Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
- [`schedulers.DDIMScheduler`], will be ignored for others.
+ Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+ applies to [`schedulers.DDIMScheduler`], will be ignored for others.
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
  to make generation deterministic.
@@ -426,7 +426,7 @@ class MarigoldImageProcessor(ConfigMixin):
  if isinstance(img, np.ndarray):
  img = torch.from_numpy(img)
  if not torch.is_floating_point(img):
- raise ValueError(f"{prefix}: unexected dtype={img.dtype}.")
+ raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
  else:
  raise ValueError(f"{prefix}: unexpected type={type(img)}.")
  if val_min != 0.0 or val_max != 1.0:
@@ -464,7 +464,7 @@ class MarigoldImageProcessor(ConfigMixin):
  if torch.is_tensor(img):
  img = img.cpu().numpy()
  if not np.issubdtype(img.dtype, np.floating):
- raise ValueError(f"{prefix}: unexected dtype={img.dtype}.")
+ raise ValueError(f"{prefix}: unexpected dtype={img.dtype}.")
  if val_min != 0.0 or val_max != 1.0:
  img = (img - val_min) / (val_max - val_min)
  img = (img * (2**16 - 1)).astype(np.uint16)
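Both Marigold hunks fix the same `unexected` typo in a dtype guard that precedes 16-bit export. A compact sketch of that export path, assuming the numpy branch shown above (the helper name is ours, not the processor's):

```python
import numpy as np

# Validate a floating-point prediction, rescale [val_min, val_max] -> [0, 1],
# then quantize to full-range uint16 (e.g. for 16-bit PNG depth maps).
def to_uint16(img: np.ndarray, val_min: float = 0.0, val_max: float = 1.0) -> np.ndarray:
    if not np.issubdtype(img.dtype, np.floating):
        raise ValueError(f"unexpected dtype={img.dtype}.")
    if val_min != 0.0 or val_max != 1.0:
        img = (img - val_min) / (val_max - val_min)
    return (img * (2**16 - 1)).astype(np.uint16)
```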
@@ -1,4 +1,4 @@
- # Copyright 2024 Genmo and The HuggingFace Team. All rights reserved.
+ # Copyright 2025 Genmo and The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -521,11 +521,11 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
  in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
  passed will be used. Must be in descending order.
  guidance_scale (`float`, defaults to `4.5`):
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
- usually at the expense of lower image quality.
+ Guidance scale as defined in [Classifier-Free Diffusion
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+ the text `prompt`, usually at the expense of lower image quality.
  num_videos_per_prompt (`int`, *optional*, defaults to 1):
  The number of videos to generate per prompt.
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -1,4 +1,4 @@
- # Copyright 2024 The HuggingFace Team. All rights reserved.
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -35,8 +35,8 @@ from ...utils import (
  logging,
  replace_example_docstring,
  )
- from ...utils.torch_utils import randn_tensor
- from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin
+ from ...utils.torch_utils import empty_device_cache, get_device, randn_tensor
+ from ..pipeline_utils import AudioPipelineOutput, DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin


  if is_librosa_available():
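The import swap replaces CUDA-only calls with device-agnostic helpers. A hedged approximation of what such helpers reduce to (the real `diffusers.utils.torch_utils` implementations may cover more backends and edge cases):

```python
import torch

# Rough, illustrative equivalents of get_device / empty_device_cache.
def get_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

def empty_device_cache() -> None:
    device = get_device()
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()
    elif device == "mps":
        torch.mps.empty_cache()
```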
@@ -76,7 +76,8 @@ EXAMPLE_DOC_STRING = """
  """


- class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
+ class MusicLDMPipeline(DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin):
+ _last_supported_version = "0.33.1"
  r"""
  Pipeline for text-to-audio generation using MusicLDM.

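`_last_supported_version` records the final release (0.33.1, the left side of this diff) in which the pipeline was still maintained. A hypothetical sketch of how such a mixin could surface that information; the actual `DeprecatedPipelineMixin` may behave differently:

```python
import warnings

# Illustrative only: warn on construction that the pipeline is frozen at a
# given diffusers release. Not the actual DeprecatedPipelineMixin source.
class DeprecatedPipelineMixinSketch:
    _last_supported_version = None

    def __init__(self, *args, **kwargs):
        version = self._last_supported_version or "an earlier release"
        warnings.warn(
            f"{type(self).__name__} is deprecated; it was last supported in "
            f"diffusers {version} and will no longer receive fixes.",
            FutureWarning,
        )
        super().__init__(*args, **kwargs)
```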
@@ -297,7 +298,7 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
  def prepare_extra_step_kwargs(self, generator, eta):
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
  # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
  # and should be between [0, 1]

  accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -396,20 +397,22 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
  def enable_model_cpu_offload(self, gpu_id=0):
  r"""
  Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the accelerator when its
+ `forward` method is called, and the model remains in accelerator until the next model runs. Memory savings are
+ lower than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution
+ of the `unet`.
  """
  if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
  from accelerate import cpu_offload_with_hook
  else:
  raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

- device = torch.device(f"cuda:{gpu_id}")
+ device_type = get_device()
+ device = torch.device(f"{device_type}:{gpu_id}")

  if self.device.type != "cpu":
  self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+ empty_device_cache() # otherwise we don't see the memory savings (but they probably exist)

  model_sequence = [
  self.text_encoder.text_model,
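The rewritten method keeps the chained-offload pattern from accelerate: each model is moved to the execution device when its `forward` runs, while the previous hook returns the prior model to CPU. A sketch of that chaining, with `models` standing in for `model_sequence`:

```python
import torch
from accelerate import cpu_offload_with_hook

# Chain offload hooks so at most one model occupies the accelerator at a time.
def offload_chain(models, device: torch.device):
    hook = None
    for model in models:
        model, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
    return hook  # final hook; hook.offload() returns the last model to CPU
```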
@@ -472,8 +475,8 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
  and the input text. This scoring ranks the generated waveforms based on their cosine similarity to text
  input in the joint text-audio embedding space.
  eta (`float`, *optional*, defaults to 0.0):
- Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
- to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
@@ -548,7 +551,7 @@ class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):

  device = self._execution_device
  # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
  # corresponds to doing no classifier free guidance.
  do_classifier_free_guidance = guidance_scale > 1.0