diffusers 0.33.1__py3-none-any.whl → 0.35.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (551)
  1. diffusers/__init__.py +145 -1
  2. diffusers/callbacks.py +35 -0
  3. diffusers/commands/__init__.py +1 -1
  4. diffusers/commands/custom_blocks.py +134 -0
  5. diffusers/commands/diffusers_cli.py +3 -1
  6. diffusers/commands/env.py +1 -1
  7. diffusers/commands/fp16_safetensors.py +2 -2
  8. diffusers/configuration_utils.py +11 -2
  9. diffusers/dependency_versions_check.py +1 -1
  10. diffusers/dependency_versions_table.py +3 -3
  11. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  12. diffusers/guiders/__init__.py +41 -0
  13. diffusers/guiders/adaptive_projected_guidance.py +188 -0
  14. diffusers/guiders/auto_guidance.py +190 -0
  15. diffusers/guiders/classifier_free_guidance.py +141 -0
  16. diffusers/guiders/classifier_free_zero_star_guidance.py +152 -0
  17. diffusers/guiders/frequency_decoupled_guidance.py +327 -0
  18. diffusers/guiders/guider_utils.py +309 -0
  19. diffusers/guiders/perturbed_attention_guidance.py +271 -0
  20. diffusers/guiders/skip_layer_guidance.py +262 -0
  21. diffusers/guiders/smoothed_energy_guidance.py +251 -0
  22. diffusers/guiders/tangential_classifier_free_guidance.py +143 -0
  23. diffusers/hooks/__init__.py +17 -0
  24. diffusers/hooks/_common.py +56 -0
  25. diffusers/hooks/_helpers.py +293 -0
  26. diffusers/hooks/faster_cache.py +9 -8
  27. diffusers/hooks/first_block_cache.py +259 -0
  28. diffusers/hooks/group_offloading.py +332 -227
  29. diffusers/hooks/hooks.py +58 -3
  30. diffusers/hooks/layer_skip.py +263 -0
  31. diffusers/hooks/layerwise_casting.py +5 -10
  32. diffusers/hooks/pyramid_attention_broadcast.py +15 -12
  33. diffusers/hooks/smoothed_energy_guidance_utils.py +167 -0
  34. diffusers/hooks/utils.py +43 -0
  35. diffusers/image_processor.py +7 -2
  36. diffusers/loaders/__init__.py +10 -0
  37. diffusers/loaders/ip_adapter.py +260 -18
  38. diffusers/loaders/lora_base.py +261 -127
  39. diffusers/loaders/lora_conversion_utils.py +657 -35
  40. diffusers/loaders/lora_pipeline.py +2778 -1246
  41. diffusers/loaders/peft.py +78 -112
  42. diffusers/loaders/single_file.py +2 -2
  43. diffusers/loaders/single_file_model.py +64 -15
  44. diffusers/loaders/single_file_utils.py +395 -7
  45. diffusers/loaders/textual_inversion.py +3 -2
  46. diffusers/loaders/transformer_flux.py +10 -11
  47. diffusers/loaders/transformer_sd3.py +8 -3
  48. diffusers/loaders/unet.py +24 -21
  49. diffusers/loaders/unet_loader_utils.py +6 -3
  50. diffusers/loaders/utils.py +1 -1
  51. diffusers/models/__init__.py +23 -1
  52. diffusers/models/activations.py +5 -5
  53. diffusers/models/adapter.py +2 -3
  54. diffusers/models/attention.py +488 -7
  55. diffusers/models/attention_dispatch.py +1218 -0
  56. diffusers/models/attention_flax.py +10 -10
  57. diffusers/models/attention_processor.py +113 -667
  58. diffusers/models/auto_model.py +49 -12
  59. diffusers/models/autoencoders/__init__.py +2 -0
  60. diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
  61. diffusers/models/autoencoders/autoencoder_dc.py +17 -4
  62. diffusers/models/autoencoders/autoencoder_kl.py +5 -5
  63. diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
  64. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
  65. diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1110 -0
  66. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
  67. diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
  68. diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
  69. diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
  70. diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +1070 -0
  71. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
  72. diffusers/models/autoencoders/autoencoder_kl_wan.py +626 -62
  73. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
  74. diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
  75. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  76. diffusers/models/autoencoders/vae.py +13 -2
  77. diffusers/models/autoencoders/vq_model.py +2 -2
  78. diffusers/models/cache_utils.py +32 -10
  79. diffusers/models/controlnet.py +1 -1
  80. diffusers/models/controlnet_flux.py +1 -1
  81. diffusers/models/controlnet_sd3.py +1 -1
  82. diffusers/models/controlnet_sparsectrl.py +1 -1
  83. diffusers/models/controlnets/__init__.py +1 -0
  84. diffusers/models/controlnets/controlnet.py +3 -3
  85. diffusers/models/controlnets/controlnet_flax.py +1 -1
  86. diffusers/models/controlnets/controlnet_flux.py +21 -20
  87. diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
  88. diffusers/models/controlnets/controlnet_sana.py +290 -0
  89. diffusers/models/controlnets/controlnet_sd3.py +1 -1
  90. diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
  91. diffusers/models/controlnets/controlnet_union.py +5 -5
  92. diffusers/models/controlnets/controlnet_xs.py +7 -7
  93. diffusers/models/controlnets/multicontrolnet.py +4 -5
  94. diffusers/models/controlnets/multicontrolnet_union.py +5 -6
  95. diffusers/models/downsampling.py +2 -2
  96. diffusers/models/embeddings.py +36 -46
  97. diffusers/models/embeddings_flax.py +2 -2
  98. diffusers/models/lora.py +3 -3
  99. diffusers/models/model_loading_utils.py +233 -1
  100. diffusers/models/modeling_flax_utils.py +1 -2
  101. diffusers/models/modeling_utils.py +203 -108
  102. diffusers/models/normalization.py +4 -4
  103. diffusers/models/resnet.py +2 -2
  104. diffusers/models/resnet_flax.py +1 -1
  105. diffusers/models/transformers/__init__.py +7 -0
  106. diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
  107. diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
  108. diffusers/models/transformers/consisid_transformer_3d.py +1 -1
  109. diffusers/models/transformers/dit_transformer_2d.py +2 -2
  110. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  111. diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
  112. diffusers/models/transformers/latte_transformer_3d.py +4 -5
  113. diffusers/models/transformers/lumina_nextdit2d.py +2 -2
  114. diffusers/models/transformers/pixart_transformer_2d.py +3 -3
  115. diffusers/models/transformers/prior_transformer.py +1 -1
  116. diffusers/models/transformers/sana_transformer.py +8 -3
  117. diffusers/models/transformers/stable_audio_transformer.py +5 -9
  118. diffusers/models/transformers/t5_film_transformer.py +3 -3
  119. diffusers/models/transformers/transformer_2d.py +1 -1
  120. diffusers/models/transformers/transformer_allegro.py +1 -1
  121. diffusers/models/transformers/transformer_chroma.py +641 -0
  122. diffusers/models/transformers/transformer_cogview3plus.py +5 -10
  123. diffusers/models/transformers/transformer_cogview4.py +353 -27
  124. diffusers/models/transformers/transformer_cosmos.py +586 -0
  125. diffusers/models/transformers/transformer_flux.py +376 -138
  126. diffusers/models/transformers/transformer_hidream_image.py +942 -0
  127. diffusers/models/transformers/transformer_hunyuan_video.py +12 -8
  128. diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
  129. diffusers/models/transformers/transformer_ltx.py +105 -24
  130. diffusers/models/transformers/transformer_lumina2.py +1 -1
  131. diffusers/models/transformers/transformer_mochi.py +1 -1
  132. diffusers/models/transformers/transformer_omnigen.py +2 -2
  133. diffusers/models/transformers/transformer_qwenimage.py +645 -0
  134. diffusers/models/transformers/transformer_sd3.py +7 -7
  135. diffusers/models/transformers/transformer_skyreels_v2.py +607 -0
  136. diffusers/models/transformers/transformer_temporal.py +1 -1
  137. diffusers/models/transformers/transformer_wan.py +316 -87
  138. diffusers/models/transformers/transformer_wan_vace.py +387 -0
  139. diffusers/models/unets/unet_1d.py +1 -1
  140. diffusers/models/unets/unet_1d_blocks.py +1 -1
  141. diffusers/models/unets/unet_2d.py +1 -1
  142. diffusers/models/unets/unet_2d_blocks.py +1 -1
  143. diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
  144. diffusers/models/unets/unet_2d_condition.py +4 -3
  145. diffusers/models/unets/unet_2d_condition_flax.py +2 -2
  146. diffusers/models/unets/unet_3d_blocks.py +1 -1
  147. diffusers/models/unets/unet_3d_condition.py +3 -3
  148. diffusers/models/unets/unet_i2vgen_xl.py +3 -3
  149. diffusers/models/unets/unet_kandinsky3.py +1 -1
  150. diffusers/models/unets/unet_motion_model.py +2 -2
  151. diffusers/models/unets/unet_stable_cascade.py +1 -1
  152. diffusers/models/upsampling.py +2 -2
  153. diffusers/models/vae_flax.py +2 -2
  154. diffusers/models/vq_model.py +1 -1
  155. diffusers/modular_pipelines/__init__.py +83 -0
  156. diffusers/modular_pipelines/components_manager.py +1068 -0
  157. diffusers/modular_pipelines/flux/__init__.py +66 -0
  158. diffusers/modular_pipelines/flux/before_denoise.py +689 -0
  159. diffusers/modular_pipelines/flux/decoders.py +109 -0
  160. diffusers/modular_pipelines/flux/denoise.py +227 -0
  161. diffusers/modular_pipelines/flux/encoders.py +412 -0
  162. diffusers/modular_pipelines/flux/modular_blocks.py +181 -0
  163. diffusers/modular_pipelines/flux/modular_pipeline.py +59 -0
  164. diffusers/modular_pipelines/modular_pipeline.py +2446 -0
  165. diffusers/modular_pipelines/modular_pipeline_utils.py +672 -0
  166. diffusers/modular_pipelines/node_utils.py +665 -0
  167. diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +77 -0
  168. diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py +1874 -0
  169. diffusers/modular_pipelines/stable_diffusion_xl/decoders.py +208 -0
  170. diffusers/modular_pipelines/stable_diffusion_xl/denoise.py +771 -0
  171. diffusers/modular_pipelines/stable_diffusion_xl/encoders.py +887 -0
  172. diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +380 -0
  173. diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +365 -0
  174. diffusers/modular_pipelines/wan/__init__.py +66 -0
  175. diffusers/modular_pipelines/wan/before_denoise.py +365 -0
  176. diffusers/modular_pipelines/wan/decoders.py +105 -0
  177. diffusers/modular_pipelines/wan/denoise.py +261 -0
  178. diffusers/modular_pipelines/wan/encoders.py +242 -0
  179. diffusers/modular_pipelines/wan/modular_blocks.py +144 -0
  180. diffusers/modular_pipelines/wan/modular_pipeline.py +90 -0
  181. diffusers/pipelines/__init__.py +68 -6
  182. diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
  183. diffusers/pipelines/amused/pipeline_amused.py +7 -6
  184. diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
  185. diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
  186. diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
  187. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
  188. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
  189. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
  190. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
  191. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
  192. diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
  193. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  194. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +22 -13
  195. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
  196. diffusers/pipelines/auto_pipeline.py +23 -20
  197. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  198. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  199. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
  200. diffusers/pipelines/chroma/__init__.py +49 -0
  201. diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
  202. diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
  203. diffusers/pipelines/chroma/pipeline_output.py +21 -0
  204. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +17 -16
  205. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +17 -16
  206. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +18 -17
  207. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +17 -16
  208. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
  209. diffusers/pipelines/cogview4/pipeline_cogview4.py +23 -22
  210. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
  211. diffusers/pipelines/consisid/consisid_utils.py +2 -2
  212. diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
  213. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  214. diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
  215. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +11 -10
  216. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
  217. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
  218. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
  219. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
  220. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
  221. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +226 -107
  222. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +12 -8
  223. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +207 -105
  224. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  225. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
  226. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
  227. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  228. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
  229. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
  230. diffusers/pipelines/cosmos/__init__.py +54 -0
  231. diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
  232. diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
  233. diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
  234. diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
  235. diffusers/pipelines/cosmos/pipeline_output.py +40 -0
  236. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
  237. diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
  238. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  239. diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
  240. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
  241. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
  242. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
  243. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
  244. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
  245. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
  246. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
  247. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  248. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
  249. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  250. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
  251. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
  252. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  253. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  254. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  255. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  256. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  257. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +8 -8
  258. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
  259. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
  260. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
  261. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
  262. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
  263. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  264. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
  265. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
  266. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
  267. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
  268. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
  269. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  270. diffusers/pipelines/dit/pipeline_dit.py +4 -2
  271. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
  272. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
  273. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
  274. diffusers/pipelines/flux/__init__.py +4 -0
  275. diffusers/pipelines/flux/modeling_flux.py +1 -1
  276. diffusers/pipelines/flux/pipeline_flux.py +37 -36
  277. diffusers/pipelines/flux/pipeline_flux_control.py +9 -9
  278. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +7 -7
  279. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +7 -7
  280. diffusers/pipelines/flux/pipeline_flux_controlnet.py +7 -7
  281. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +31 -23
  282. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +3 -2
  283. diffusers/pipelines/flux/pipeline_flux_fill.py +7 -7
  284. diffusers/pipelines/flux/pipeline_flux_img2img.py +40 -7
  285. diffusers/pipelines/flux/pipeline_flux_inpaint.py +12 -7
  286. diffusers/pipelines/flux/pipeline_flux_kontext.py +1134 -0
  287. diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py +1460 -0
  288. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +2 -2
  289. diffusers/pipelines/flux/pipeline_output.py +6 -4
  290. diffusers/pipelines/free_init_utils.py +2 -2
  291. diffusers/pipelines/free_noise_utils.py +3 -3
  292. diffusers/pipelines/hidream_image/__init__.py +47 -0
  293. diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
  294. diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
  295. diffusers/pipelines/hunyuan_video/__init__.py +2 -0
  296. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
  297. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +26 -25
  298. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
  299. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
  300. diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
  301. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
  302. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
  303. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
  304. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
  305. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
  306. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
  307. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
  308. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  309. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
  310. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
  311. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
  312. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
  313. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
  314. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
  315. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
  316. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
  317. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
  318. diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
  319. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
  320. diffusers/pipelines/kolors/text_encoder.py +3 -3
  321. diffusers/pipelines/kolors/tokenizer.py +1 -1
  322. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
  323. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
  324. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  325. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
  326. diffusers/pipelines/latte/pipeline_latte.py +12 -12
  327. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
  328. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
  329. diffusers/pipelines/ltx/__init__.py +4 -0
  330. diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
  331. diffusers/pipelines/ltx/pipeline_ltx.py +64 -18
  332. diffusers/pipelines/ltx/pipeline_ltx_condition.py +117 -38
  333. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +63 -18
  334. diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
  335. diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
  336. diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
  337. diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
  338. diffusers/pipelines/mochi/pipeline_mochi.py +15 -14
  339. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
  340. diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
  341. diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
  342. diffusers/pipelines/onnx_utils.py +15 -2
  343. diffusers/pipelines/pag/pag_utils.py +2 -2
  344. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
  345. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
  346. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
  347. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
  348. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
  349. diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
  350. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
  351. diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
  352. diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
  353. diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
  354. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
  355. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
  356. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
  357. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
  358. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
  359. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
  360. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
  361. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  362. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
  363. diffusers/pipelines/pia/pipeline_pia.py +8 -6
  364. diffusers/pipelines/pipeline_flax_utils.py +5 -6
  365. diffusers/pipelines/pipeline_loading_utils.py +113 -15
  366. diffusers/pipelines/pipeline_utils.py +127 -48
  367. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +14 -12
  368. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +31 -11
  369. diffusers/pipelines/qwenimage/__init__.py +55 -0
  370. diffusers/pipelines/qwenimage/pipeline_output.py +21 -0
  371. diffusers/pipelines/qwenimage/pipeline_qwenimage.py +726 -0
  372. diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +882 -0
  373. diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +829 -0
  374. diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +1015 -0
  375. diffusers/pipelines/sana/__init__.py +4 -0
  376. diffusers/pipelines/sana/pipeline_sana.py +23 -21
  377. diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
  378. diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
  379. diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
  380. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
  381. diffusers/pipelines/shap_e/camera.py +1 -1
  382. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  383. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  384. diffusers/pipelines/shap_e/renderer.py +3 -3
  385. diffusers/pipelines/skyreels_v2/__init__.py +59 -0
  386. diffusers/pipelines/skyreels_v2/pipeline_output.py +20 -0
  387. diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py +610 -0
  388. diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py +978 -0
  389. diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py +1059 -0
  390. diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_v2v.py +1063 -0
  391. diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py +745 -0
  392. diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
  393. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
  394. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
  395. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
  396. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
  397. diffusers/pipelines/stable_diffusion/__init__.py +0 -7
  398. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  399. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
  400. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  401. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
  402. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  403. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +12 -11
  404. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
  405. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +11 -11
  406. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +10 -10
  407. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -9
  408. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
  409. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
  410. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
  411. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
  412. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
  413. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
  414. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
  415. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
  416. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
  417. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  418. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  419. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  420. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +13 -12
  421. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
  422. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
  423. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
  424. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
  425. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
  426. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
  427. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
  428. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
  429. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
  430. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
  431. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
  432. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  433. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
  434. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  435. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
  436. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
  437. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
  438. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
  439. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
  440. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
  441. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
  442. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
  443. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
  444. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
  445. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
  446. diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
  447. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
  448. diffusers/pipelines/unclip/text_proj.py +2 -2
  449. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
  450. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  451. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
  452. diffusers/pipelines/visualcloze/__init__.py +52 -0
  453. diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
  454. diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
  455. diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
  456. diffusers/pipelines/wan/__init__.py +2 -0
  457. diffusers/pipelines/wan/pipeline_wan.py +91 -30
  458. diffusers/pipelines/wan/pipeline_wan_i2v.py +145 -45
  459. diffusers/pipelines/wan/pipeline_wan_vace.py +975 -0
  460. diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
  461. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  462. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
  463. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  464. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  465. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
  466. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
  467. diffusers/quantizers/__init__.py +3 -1
  468. diffusers/quantizers/base.py +17 -1
  469. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
  470. diffusers/quantizers/bitsandbytes/utils.py +10 -7
  471. diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
  472. diffusers/quantizers/gguf/utils.py +108 -16
  473. diffusers/quantizers/pipe_quant_config.py +202 -0
  474. diffusers/quantizers/quantization_config.py +18 -16
  475. diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
  476. diffusers/quantizers/torchao/torchao_quantizer.py +31 -1
  477. diffusers/schedulers/__init__.py +3 -1
  478. diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
  479. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  480. diffusers/schedulers/scheduling_consistency_models.py +1 -1
  481. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
  482. diffusers/schedulers/scheduling_ddim.py +8 -8
  483. diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
  484. diffusers/schedulers/scheduling_ddim_flax.py +6 -6
  485. diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
  486. diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
  487. diffusers/schedulers/scheduling_ddpm.py +9 -9
  488. diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
  489. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
  490. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
  491. diffusers/schedulers/scheduling_deis_multistep.py +16 -9
  492. diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
  493. diffusers/schedulers/scheduling_dpmsolver_multistep.py +18 -12
  494. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
  495. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
  496. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  497. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +19 -13
  498. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
  499. diffusers/schedulers/scheduling_edm_euler.py +20 -11
  500. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
  501. diffusers/schedulers/scheduling_euler_discrete.py +3 -3
  502. diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
  503. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
  504. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
  505. diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
  506. diffusers/schedulers/scheduling_heun_discrete.py +2 -2
  507. diffusers/schedulers/scheduling_ipndm.py +2 -2
  508. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
  509. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
  510. diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
  511. diffusers/schedulers/scheduling_lcm.py +3 -3
  512. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  513. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  514. diffusers/schedulers/scheduling_pndm.py +4 -4
  515. diffusers/schedulers/scheduling_pndm_flax.py +4 -4
  516. diffusers/schedulers/scheduling_repaint.py +9 -9
  517. diffusers/schedulers/scheduling_sasolver.py +15 -15
  518. diffusers/schedulers/scheduling_scm.py +1 -2
  519. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  520. diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
  521. diffusers/schedulers/scheduling_tcd.py +3 -3
  522. diffusers/schedulers/scheduling_unclip.py +5 -5
  523. diffusers/schedulers/scheduling_unipc_multistep.py +21 -12
  524. diffusers/schedulers/scheduling_utils.py +3 -3
  525. diffusers/schedulers/scheduling_utils_flax.py +2 -2
  526. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  527. diffusers/training_utils.py +91 -5
  528. diffusers/utils/__init__.py +15 -0
  529. diffusers/utils/accelerate_utils.py +1 -1
  530. diffusers/utils/constants.py +4 -0
  531. diffusers/utils/doc_utils.py +1 -1
  532. diffusers/utils/dummy_pt_objects.py +432 -0
  533. diffusers/utils/dummy_torch_and_transformers_objects.py +480 -0
  534. diffusers/utils/dynamic_modules_utils.py +85 -8
  535. diffusers/utils/export_utils.py +1 -1
  536. diffusers/utils/hub_utils.py +33 -17
  537. diffusers/utils/import_utils.py +151 -18
  538. diffusers/utils/logging.py +1 -1
  539. diffusers/utils/outputs.py +2 -1
  540. diffusers/utils/peft_utils.py +96 -10
  541. diffusers/utils/state_dict_utils.py +20 -3
  542. diffusers/utils/testing_utils.py +195 -17
  543. diffusers/utils/torch_utils.py +43 -5
  544. diffusers/video_processor.py +2 -2
  545. {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/METADATA +72 -57
  546. diffusers-0.35.0.dist-info/RECORD +703 -0
  547. {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/WHEEL +1 -1
  548. diffusers-0.33.1.dist-info/RECORD +0 -608
  549. {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/LICENSE +0 -0
  550. {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/entry_points.txt +0 -0
  551. {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/visualcloze/visualcloze_utils.py
@@ -0,0 +1,251 @@
+ # Copyright 2025 VisualCloze team and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import torch
+ from PIL import Image
+
+ from ...image_processor import VaeImageProcessor
+
+
+ class VisualClozeProcessor(VaeImageProcessor):
+     """
+     Image processor for the VisualCloze pipeline.
+
+     This processor handles the preprocessing of images for visual cloze tasks, including resizing, normalization, and
+     mask generation.
+
+     Args:
+         resolution (int, optional):
+             Target resolution for processing images. Each image is resized to this resolution before concatenation to
+             avoid out-of-memory errors. Defaults to 384.
+         *args: Additional arguments passed to [`~image_processor.VaeImageProcessor`]
+         **kwargs: Additional keyword arguments passed to [`~image_processor.VaeImageProcessor`]
+     """
+
+     def __init__(self, *args, resolution: int = 384, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.resolution = resolution
+
+     def preprocess_image(
+         self, input_images: List[List[Optional[Image.Image]]], vae_scale_factor: int
+     ) -> Tuple[List[List[torch.Tensor]], List[List[List[int]]], List[int]]:
+         """
+         Preprocesses input images for the VisualCloze pipeline.
+
+         This function handles the preprocessing of input images by:
+         1. Resizing and cropping images to maintain consistent dimensions
+         2. Converting images to the Tensor format for the VAE
+         3. Normalizing pixel values
+         4. Tracking image sizes and positions of target images
+
+         Args:
+             input_images (List[List[Optional[Image.Image]]]):
+                 A nested list of PIL Images where:
+                 - Outer list represents different samples, including in-context examples and the query
+                 - Inner list contains the images for one task
+                 - In the last row, condition images are provided and target images are set to `None`
+             vae_scale_factor (int):
+                 The scale factor used by the VAE for resizing images
+
+         Returns:
+             Tuple containing:
+             - List[List[torch.Tensor]]: Preprocessed images in tensor format
+             - List[List[List[int]]]: Dimensions of each processed image [height, width]
+             - List[int]: Target positions indicating which images are to be generated
+         """
+         n_samples, n_task_images = len(input_images), len(input_images[0])
+         divisible = 2 * vae_scale_factor
+
+         processed_images: List[List[Image.Image]] = [[] for _ in range(n_samples)]
+         resize_size: List[Optional[Tuple[int, int]]] = [None for _ in range(n_samples)]
+         target_position: List[int] = []
+
+         # Process each sample
+         for i in range(n_samples):
+             # Determine size from first non-None image
+             for j in range(n_task_images):
+                 if input_images[i][j] is not None:
+                     aspect_ratio = input_images[i][j].width / input_images[i][j].height
+                     target_area = self.resolution * self.resolution
+                     new_h = int((target_area / aspect_ratio) ** 0.5)
+                     new_w = int(new_h * aspect_ratio)
+
+                     new_w = max(new_w // divisible, 1) * divisible
+                     new_h = max(new_h // divisible, 1) * divisible
+                     resize_size[i] = (new_w, new_h)
+                     break
+
+             # Process all images in the sample
+             for j in range(n_task_images):
+                 if input_images[i][j] is not None:
+                     target = self._resize_and_crop(input_images[i][j], resize_size[i][0], resize_size[i][1])
+                     processed_images[i].append(target)
+                     if i == n_samples - 1:
+                         target_position.append(0)
+                 else:
+                     blank = Image.new("RGB", resize_size[i] or (self.resolution, self.resolution), (0, 0, 0))
+                     processed_images[i].append(blank)
+                     if i == n_samples - 1:
+                         target_position.append(1)
+
+         # Ensure a consistent width when there are multiple target images
+         if len(target_position) > 1 and sum(target_position) > 1:
+             new_w = resize_size[n_samples - 1][0] or 384
+             for i in range(len(processed_images)):
+                 for j in range(len(processed_images[i])):
+                     if processed_images[i][j] is not None:
+                         new_h = int(processed_images[i][j].height * (new_w / processed_images[i][j].width))
+                         new_w = int(new_w / 16) * 16
+                         new_h = int(new_h / 16) * 16
+                         processed_images[i][j] = self.resize(processed_images[i][j], new_h, new_w)
+
+         # Convert to tensors and normalize
+         image_sizes = []
+         for i in range(len(processed_images)):
+             image_sizes.append([[img.height, img.width] for img in processed_images[i]])
+             for j, image in enumerate(processed_images[i]):
+                 image = self.pil_to_numpy(image)
+                 image = self.numpy_to_pt(image)
+                 image = self.normalize(image)
+                 processed_images[i][j] = image
+
+         return processed_images, image_sizes, target_position
+
+     def preprocess_mask(
+         self, input_images: List[List[Image.Image]], target_position: List[int]
+     ) -> List[List[torch.Tensor]]:
+         """
+         Generate masks for the VisualCloze pipeline.
+
+         Args:
+             input_images (List[List[Image.Image]]):
+                 Processed images from preprocess_image
+             target_position (List[int]):
+                 Binary list marking the positions of target images (1 for target, 0 for condition)
+
+         Returns:
+             List[List[torch.Tensor]]:
+                 A nested list of mask tensors (1 for target positions, 0 for condition images)
+         """
+         mask = []
+         for i, row in enumerate(input_images):
+             if i == len(input_images) - 1:  # Query row
+                 row_masks = [
+                     torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=m) for m in target_position
+                 ]
+             else:  # In-context examples
+                 row_masks = [
+                     torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=0) for _ in target_position
+                 ]
+             mask.append(row_masks)
+         return mask
+
+     def preprocess_image_upsampling(
+         self,
+         input_images: List[List[Image.Image]],
+         height: int,
+         width: int,
+     ) -> Tuple[List[List[Image.Image]], List[List[List[int]]]]:
+         """Process images for the upsampling stage in the VisualCloze pipeline.
+
+         Args:
+             input_images: Nested list whose entry [0][0] is the image to process
+             height: Target height
+             width: Target width
+
+         Returns:
+             Tuple of the processed images and their sizes
+         """
+         image = self.resize(input_images[0][0], height, width)
+         image = self.pil_to_numpy(image)  # to np
+         image = self.numpy_to_pt(image)  # to pt
+         image = self.normalize(image)
+
+         input_images[0][0] = image
+         image_sizes = [[[height, width]]]
+         return input_images, image_sizes
+
+     def preprocess_mask_upsampling(self, input_images: List[List[Image.Image]]) -> List[List[torch.Tensor]]:
+         return [[torch.ones((1, 1, input_images[0][0].shape[2], input_images[0][0].shape[3]))]]
+
+     def get_layout_prompt(self, size: Tuple[int, int]) -> str:
+         layout_instruction = (
+             f"A grid layout with {size[0]} rows and {size[1]} columns, displaying {size[0] * size[1]} images arranged side by side."
+         )
+         return layout_instruction
+
+     def preprocess(
+         self,
+         task_prompt: Union[str, List[str]],
+         content_prompt: Union[str, List[str]],
+         input_images: Optional[List[List[List[Optional[str]]]]] = None,
+         height: Optional[int] = None,
+         width: Optional[int] = None,
+         upsampling: bool = False,
+         vae_scale_factor: int = 16,
+     ) -> Dict:
+         """Process visual cloze inputs.
+
+         Args:
+             task_prompt: Task description(s)
+             content_prompt: Content description(s)
+             input_images: Nested list of images, with `None` in place of each target image
+             height: Optional target height for the upsampling stage
+             width: Optional target width for the upsampling stage
+             upsampling: Whether this call is for the upsampling stage
+
+         Returns:
+             Dictionary containing processed images, masks, prompts and metadata
+         """
+         if isinstance(task_prompt, str):
+             task_prompt = [task_prompt]
+             content_prompt = [content_prompt]
+             input_images = [input_images]
+
+         output = {
+             "init_image": [],
+             "mask": [],
+             "task_prompt": task_prompt if not upsampling else [None for _ in range(len(task_prompt))],
+             "content_prompt": content_prompt,
+             "layout_prompt": [],
+             "target_position": [],
+             "image_size": [],
+         }
+         for i in range(len(task_prompt)):
+             if upsampling:
+                 layout_prompt = None
+             else:
+                 layout_prompt = self.get_layout_prompt((len(input_images[i]), len(input_images[i][0])))
+
+             if upsampling:
+                 cur_processed_images, cur_image_size = self.preprocess_image_upsampling(
+                     input_images[i], height=height, width=width
+                 )
+                 cur_mask = self.preprocess_mask_upsampling(cur_processed_images)
+             else:
+                 cur_processed_images, cur_image_size, cur_target_position = self.preprocess_image(
+                     input_images[i], vae_scale_factor=vae_scale_factor
+                 )
+                 cur_mask = self.preprocess_mask(cur_processed_images, cur_target_position)
+
+                 output["target_position"].append(cur_target_position)
+
+             output["image_size"].append(cur_image_size)
+             output["init_image"].append(cur_processed_images)
+             output["mask"].append(cur_mask)
+             output["layout_prompt"].append(layout_prompt)
+
+         return output
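
A quick usage sketch of the processor above (a minimal sketch: the file paths and prompt strings are placeholders, and the import path simply follows this file's location in the diff; the grid shape mirrors the docstring, one in-context example row plus a query row whose target slot is `None`):

    from PIL import Image

    from diffusers.pipelines.visualcloze.visualcloze_utils import VisualClozeProcessor

    processor = VisualClozeProcessor(resolution=384)

    # One in-context example row plus the query row; the query's target slot is None.
    grid = [
        [Image.open("example_condition.png"), Image.open("example_target.png")],  # placeholder paths
        [Image.open("query_condition.png"), None],
    ]

    out = processor.preprocess(
        task_prompt="Translate each condition image into a photo.",  # placeholder prompt
        content_prompt="A photo of a cat.",  # placeholder prompt
        input_images=grid,  # str prompts are auto-wrapped into a batch of one
        vae_scale_factor=16,
    )
    # out["mask"][0] marks target slots with 1 and condition images with 0;
    # out["layout_prompt"][0] describes the 2x2 grid.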
diffusers/pipelines/wan/__init__.py
@@ -24,6 +24,7 @@ except OptionalDependencyNotAvailable:
  else:
      _import_structure["pipeline_wan"] = ["WanPipeline"]
      _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
+     _import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
      _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
  if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
      try:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
      else:
          from .pipeline_wan import WanPipeline
          from .pipeline_wan_i2v import WanImageToVideoPipeline
+         from .pipeline_wan_vace import WanVACEPipeline
          from .pipeline_wan_video2video import WanVideoToVideoPipeline

  else:
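
With the registration above, the new VACE pipeline becomes importable from the subpackage:

    from diffusers.pipelines.wan import WanVACEPipeline

A top-level `diffusers` re-export is presumably added via the `diffusers/__init__.py` changes listed above, but that hunk is not shown here.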
diffusers/pipelines/wan/pipeline_wan.py
@@ -112,18 +112,31 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
              A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
          vae ([`AutoencoderKLWan`]):
              Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+         transformer_2 ([`WanTransformer3DModel`], *optional*):
+             Conditional Transformer to denoise the input latents during the low-noise stage. If provided, enables
+             two-stage denoising where `transformer` handles high-noise stages and `transformer_2` handles low-noise
+             stages. If not provided, only `transformer` is used.
+         boundary_ratio (`float`, *optional*, defaults to `None`):
+             Ratio of total timesteps to use as the boundary for switching between transformers in two-stage denoising.
+             The actual boundary timestep is calculated as `boundary_ratio * num_train_timesteps`. When provided,
+             `transformer` handles timesteps >= boundary_timestep and `transformer_2` handles timesteps <
+             boundary_timestep. If `None`, only `transformer` is used for the entire denoising process.
      """

-     model_cpu_offload_seq = "text_encoder->transformer->vae"
+     model_cpu_offload_seq = "text_encoder->transformer->transformer_2->vae"
      _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+     _optional_components = ["transformer", "transformer_2"]

      def __init__(
          self,
          tokenizer: AutoTokenizer,
          text_encoder: UMT5EncoderModel,
-         transformer: WanTransformer3DModel,
          vae: AutoencoderKLWan,
          scheduler: FlowMatchEulerDiscreteScheduler,
+         transformer: Optional[WanTransformer3DModel] = None,
+         transformer_2: Optional[WanTransformer3DModel] = None,
+         boundary_ratio: Optional[float] = None,
+         expand_timesteps: bool = False,  # Wan2.2 ti2v
      ):
          super().__init__()

@@ -133,10 +146,12 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
              tokenizer=tokenizer,
              transformer=transformer,
              scheduler=scheduler,
+             transformer_2=transformer_2,
          )
-
-         self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
-         self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
+         self.register_to_config(boundary_ratio=boundary_ratio)
+         self.register_to_config(expand_timesteps=expand_timesteps)
+         self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+         self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
          self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

      def _get_t5_prompt_embeds(
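
To make the two-stage switch concrete, a small worked example (illustrative numbers only; `num_train_timesteps` defaults to 1000 for `FlowMatchEulerDiscreteScheduler`, and the boundary formula appears in the denoising loop later in this diff):

    # Illustrative values; boundary_ratio is whatever the checkpoint registers.
    num_train_timesteps = 1000
    boundary_ratio = 0.875  # example value, not a library default
    boundary_timestep = boundary_ratio * num_train_timesteps  # 875.0

    # In the denoising loop below:
    #   t >= 875 -> self.transformer   (high-noise stage), scaled by guidance_scale
    #   t <  875 -> self.transformer_2 (low-noise stage),  scaled by guidance_scale_2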
@@ -270,6 +285,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
          prompt_embeds=None,
          negative_prompt_embeds=None,
          callback_on_step_end_tensor_inputs=None,
+         guidance_scale_2=None,
      ):
          if height % 16 != 0 or width % 16 != 0:
              raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
@@ -302,6 +318,9 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
          ):
              raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")

+         if self.config.boundary_ratio is None and guidance_scale_2 is not None:
+             raise ValueError("`guidance_scale_2` is only supported when the pipeline's `boundary_ratio` is not None.")
+
      def prepare_latents(
          self,
          batch_size: int,
@@ -369,6 +388,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
          num_frames: int = 81,
          num_inference_steps: int = 50,
          guidance_scale: float = 5.0,
+         guidance_scale_2: Optional[float] = None,
          num_videos_per_prompt: Optional[int] = 1,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
          latents: Optional[torch.Tensor] = None,
@@ -388,8 +408,10 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):

          Args:
              prompt (`str` or `List[str]`, *optional*):
-                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                 instead.
+                 The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+             negative_prompt (`str` or `List[str]`, *optional*):
+                 The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                 instead. Ignored when not using guidance (`guidance_scale` < `1`).
              height (`int`, defaults to `480`):
                  The height in pixels of the generated image.
              width (`int`, defaults to `832`):
@@ -400,11 +422,15 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                  expense of slower inference.
              guidance_scale (`float`, defaults to `5.0`):
-                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                 usually at the expense of lower image quality.
+                 Guidance scale as defined in [Classifier-Free Diffusion
+                 Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                 the text `prompt`, usually at the expense of lower image quality.
+             guidance_scale_2 (`float`, *optional*, defaults to `None`):
+                 Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
+                 `boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
+                 and the pipeline's `boundary_ratio` are not None.
              num_videos_per_prompt (`int`, *optional*, defaults to 1):
                  The number of images to generate per prompt.
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -417,7 +443,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
              prompt_embeds (`torch.Tensor`, *optional*):
                  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                  provided, text embeddings are generated from the `prompt` input argument.
-             output_type (`str`, *optional*, defaults to `"pil"`):
+             output_type (`str`, *optional*, defaults to `"np"`):
                  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
              return_dict (`bool`, *optional*, defaults to `True`):
                  Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -434,8 +460,9 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                  `._callback_tensor_inputs` attribute of your pipeline class.
-             autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                 The dtype to use for the torch.amp.autocast.
+             max_sequence_length (`int`, defaults to `512`):
+                 The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                 truncated. If the prompt is shorter, it will be padded to this length.

          Examples:

@@ -458,6 +485,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
              prompt_embeds,
              negative_prompt_embeds,
              callback_on_step_end_tensor_inputs,
+             guidance_scale_2,
          )

          if num_frames % self.vae_scale_factor_temporal != 1:
@@ -467,7 +495,11 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
              num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
          num_frames = max(num_frames, 1)

+         if self.config.boundary_ratio is not None and guidance_scale_2 is None:
+             guidance_scale_2 = guidance_scale
+
          self._guidance_scale = guidance_scale
+         self._guidance_scale_2 = guidance_scale_2
          self._attention_kwargs = attention_kwargs
          self._current_timestep = None
          self._interrupt = False
@@ -494,7 +526,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
              device=device,
          )

-         transformer_dtype = self.transformer.dtype
+         transformer_dtype = self.transformer.dtype if self.transformer is not None else self.transformer_2.dtype
          prompt_embeds = prompt_embeds.to(transformer_dtype)
          if negative_prompt_embeds is not None:
              negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
@@ -504,7 +536,11 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
          timesteps = self.scheduler.timesteps

          # 5. Prepare latent variables
-         num_channels_latents = self.transformer.config.in_channels
+         num_channels_latents = (
+             self.transformer.config.in_channels
+             if self.transformer is not None
+             else self.transformer_2.config.in_channels
+         )
          latents = self.prepare_latents(
              batch_size * num_videos_per_prompt,
              num_channels_latents,
@@ -517,36 +553,61 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
              latents,
          )

+         mask = torch.ones(latents.shape, dtype=torch.float32, device=device)
+
          # 6. Denoising loop
          num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
          self._num_timesteps = len(timesteps)

+         if self.config.boundary_ratio is not None:
+             boundary_timestep = self.config.boundary_ratio * self.scheduler.config.num_train_timesteps
+         else:
+             boundary_timestep = None
+
          with self.progress_bar(total=num_inference_steps) as progress_bar:
              for i, t in enumerate(timesteps):
                  if self.interrupt:
                      continue

                  self._current_timestep = t
-                 latent_model_input = latents.to(transformer_dtype)
-                 timestep = t.expand(latents.shape[0])

-                 noise_pred = self.transformer(
-                     hidden_states=latent_model_input,
-                     timestep=timestep,
-                     encoder_hidden_states=prompt_embeds,
-                     attention_kwargs=attention_kwargs,
-                     return_dict=False,
-                 )[0]
+                 if boundary_timestep is None or t >= boundary_timestep:
+                     # wan2.1 or high-noise stage in wan2.2
+                     current_model = self.transformer
+                     current_guidance_scale = guidance_scale
+                 else:
+                     # low-noise stage in wan2.2
+                     current_model = self.transformer_2
+                     current_guidance_scale = guidance_scale_2

-                 if self.do_classifier_free_guidance:
-                     noise_uncond = self.transformer(
+                 latent_model_input = latents.to(transformer_dtype)
+                 if self.config.expand_timesteps:
+                     # seq_len: num_latent_frames * latent_height//2 * latent_width//2
+                     temp_ts = (mask[0][0][:, ::2, ::2] * t).flatten()
+                     # batch_size, seq_len
+                     timestep = temp_ts.unsqueeze(0).expand(latents.shape[0], -1)
+                 else:
+                     timestep = t.expand(latents.shape[0])
+
+                 with current_model.cache_context("cond"):
+                     noise_pred = current_model(
                          hidden_states=latent_model_input,
                          timestep=timestep,
-                         encoder_hidden_states=negative_prompt_embeds,
+                         encoder_hidden_states=prompt_embeds,
                          attention_kwargs=attention_kwargs,
                          return_dict=False,
                      )[0]
-                 noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+
+                 if self.do_classifier_free_guidance:
+                     with current_model.cache_context("uncond"):
+                         noise_uncond = current_model(
+                             hidden_states=latent_model_input,
+                             timestep=timestep,
+                             encoder_hidden_states=negative_prompt_embeds,
+                             attention_kwargs=attention_kwargs,
+                             return_dict=False,
+                         )[0]
+                     noise_pred = noise_uncond + current_guidance_scale * (noise_pred - noise_uncond)

                  # compute the previous noisy sample x_t -> x_t-1
                  latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
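
Putting the new arguments together, a minimal inference sketch (the checkpoint id is a placeholder; a real two-stage Wan checkpoint would register `transformer_2` and a `boundary_ratio` in its config, and `guidance_scale_2` falls back to `guidance_scale` when omitted, per the loop above):

    import torch

    from diffusers import WanPipeline

    # "<wan2.2-checkpoint>" is a hypothetical repo id, not taken from this diff.
    pipe = WanPipeline.from_pretrained("<wan2.2-checkpoint>", torch_dtype=torch.bfloat16)
    pipe.to("cuda")

    video = pipe(
        prompt="A cat surfing a wave at sunset",
        num_frames=81,
        num_inference_steps=50,
        guidance_scale=4.0,    # high-noise stage (transformer)
        guidance_scale_2=3.0,  # low-noise stage (transformer_2)
    ).frames[0]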