diffusers 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (478) hide show
  1. diffusers/__init__.py +48 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/dependency_versions_check.py +1 -1
  7. diffusers/dependency_versions_table.py +1 -1
  8. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  9. diffusers/hooks/faster_cache.py +2 -2
  10. diffusers/hooks/group_offloading.py +128 -29
  11. diffusers/hooks/hooks.py +2 -2
  12. diffusers/hooks/layerwise_casting.py +3 -3
  13. diffusers/hooks/pyramid_attention_broadcast.py +1 -1
  14. diffusers/image_processor.py +7 -2
  15. diffusers/loaders/__init__.py +4 -0
  16. diffusers/loaders/ip_adapter.py +5 -14
  17. diffusers/loaders/lora_base.py +212 -111
  18. diffusers/loaders/lora_conversion_utils.py +275 -34
  19. diffusers/loaders/lora_pipeline.py +1554 -819
  20. diffusers/loaders/peft.py +52 -109
  21. diffusers/loaders/single_file.py +2 -2
  22. diffusers/loaders/single_file_model.py +20 -4
  23. diffusers/loaders/single_file_utils.py +225 -5
  24. diffusers/loaders/textual_inversion.py +3 -2
  25. diffusers/loaders/transformer_flux.py +1 -1
  26. diffusers/loaders/transformer_sd3.py +2 -2
  27. diffusers/loaders/unet.py +2 -16
  28. diffusers/loaders/unet_loader_utils.py +1 -1
  29. diffusers/loaders/utils.py +1 -1
  30. diffusers/models/__init__.py +15 -1
  31. diffusers/models/activations.py +5 -5
  32. diffusers/models/adapter.py +2 -3
  33. diffusers/models/attention.py +4 -4
  34. diffusers/models/attention_flax.py +10 -10
  35. diffusers/models/attention_processor.py +14 -10
  36. diffusers/models/auto_model.py +47 -10
  37. diffusers/models/autoencoders/__init__.py +1 -0
  38. diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
  39. diffusers/models/autoencoders/autoencoder_dc.py +3 -3
  40. diffusers/models/autoencoders/autoencoder_kl.py +4 -4
  41. diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
  42. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
  43. diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
  44. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
  45. diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
  46. diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
  47. diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
  48. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
  49. diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
  50. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
  51. diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
  52. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  53. diffusers/models/autoencoders/vae.py +13 -2
  54. diffusers/models/autoencoders/vq_model.py +2 -2
  55. diffusers/models/cache_utils.py +1 -1
  56. diffusers/models/controlnet.py +1 -1
  57. diffusers/models/controlnet_flux.py +1 -1
  58. diffusers/models/controlnet_sd3.py +1 -1
  59. diffusers/models/controlnet_sparsectrl.py +1 -1
  60. diffusers/models/controlnets/__init__.py +1 -0
  61. diffusers/models/controlnets/controlnet.py +3 -3
  62. diffusers/models/controlnets/controlnet_flax.py +1 -1
  63. diffusers/models/controlnets/controlnet_flux.py +16 -15
  64. diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
  65. diffusers/models/controlnets/controlnet_sana.py +290 -0
  66. diffusers/models/controlnets/controlnet_sd3.py +1 -1
  67. diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
  68. diffusers/models/controlnets/controlnet_union.py +1 -1
  69. diffusers/models/controlnets/controlnet_xs.py +7 -7
  70. diffusers/models/controlnets/multicontrolnet.py +4 -5
  71. diffusers/models/controlnets/multicontrolnet_union.py +5 -6
  72. diffusers/models/downsampling.py +2 -2
  73. diffusers/models/embeddings.py +10 -12
  74. diffusers/models/embeddings_flax.py +2 -2
  75. diffusers/models/lora.py +3 -3
  76. diffusers/models/modeling_utils.py +44 -14
  77. diffusers/models/normalization.py +4 -4
  78. diffusers/models/resnet.py +2 -2
  79. diffusers/models/resnet_flax.py +1 -1
  80. diffusers/models/transformers/__init__.py +5 -0
  81. diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
  82. diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
  83. diffusers/models/transformers/consisid_transformer_3d.py +1 -1
  84. diffusers/models/transformers/dit_transformer_2d.py +2 -2
  85. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  86. diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
  87. diffusers/models/transformers/latte_transformer_3d.py +4 -5
  88. diffusers/models/transformers/lumina_nextdit2d.py +2 -2
  89. diffusers/models/transformers/pixart_transformer_2d.py +3 -3
  90. diffusers/models/transformers/prior_transformer.py +1 -1
  91. diffusers/models/transformers/sana_transformer.py +8 -3
  92. diffusers/models/transformers/stable_audio_transformer.py +5 -9
  93. diffusers/models/transformers/t5_film_transformer.py +3 -3
  94. diffusers/models/transformers/transformer_2d.py +1 -1
  95. diffusers/models/transformers/transformer_allegro.py +1 -1
  96. diffusers/models/transformers/transformer_chroma.py +742 -0
  97. diffusers/models/transformers/transformer_cogview3plus.py +5 -10
  98. diffusers/models/transformers/transformer_cogview4.py +317 -25
  99. diffusers/models/transformers/transformer_cosmos.py +579 -0
  100. diffusers/models/transformers/transformer_flux.py +9 -11
  101. diffusers/models/transformers/transformer_hidream_image.py +942 -0
  102. diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
  103. diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
  104. diffusers/models/transformers/transformer_ltx.py +2 -2
  105. diffusers/models/transformers/transformer_lumina2.py +1 -1
  106. diffusers/models/transformers/transformer_mochi.py +1 -1
  107. diffusers/models/transformers/transformer_omnigen.py +2 -2
  108. diffusers/models/transformers/transformer_sd3.py +7 -7
  109. diffusers/models/transformers/transformer_temporal.py +1 -1
  110. diffusers/models/transformers/transformer_wan.py +24 -8
  111. diffusers/models/transformers/transformer_wan_vace.py +393 -0
  112. diffusers/models/unets/unet_1d.py +1 -1
  113. diffusers/models/unets/unet_1d_blocks.py +1 -1
  114. diffusers/models/unets/unet_2d.py +1 -1
  115. diffusers/models/unets/unet_2d_blocks.py +1 -1
  116. diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
  117. diffusers/models/unets/unet_2d_condition.py +2 -2
  118. diffusers/models/unets/unet_2d_condition_flax.py +2 -2
  119. diffusers/models/unets/unet_3d_blocks.py +1 -1
  120. diffusers/models/unets/unet_3d_condition.py +3 -3
  121. diffusers/models/unets/unet_i2vgen_xl.py +3 -3
  122. diffusers/models/unets/unet_kandinsky3.py +1 -1
  123. diffusers/models/unets/unet_motion_model.py +2 -2
  124. diffusers/models/unets/unet_stable_cascade.py +1 -1
  125. diffusers/models/upsampling.py +2 -2
  126. diffusers/models/vae_flax.py +2 -2
  127. diffusers/models/vq_model.py +1 -1
  128. diffusers/pipelines/__init__.py +37 -6
  129. diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
  130. diffusers/pipelines/amused/pipeline_amused.py +7 -6
  131. diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
  132. diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
  133. diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
  134. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
  135. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
  136. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
  137. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
  138. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
  139. diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
  140. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  141. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
  142. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
  143. diffusers/pipelines/auto_pipeline.py +6 -7
  144. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  145. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  146. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
  147. diffusers/pipelines/chroma/__init__.py +49 -0
  148. diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
  149. diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
  150. diffusers/pipelines/chroma/pipeline_output.py +21 -0
  151. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
  152. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
  153. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
  154. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
  155. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
  156. diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
  157. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
  158. diffusers/pipelines/consisid/consisid_utils.py +2 -2
  159. diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
  160. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  161. diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
  162. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
  163. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
  164. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
  165. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
  166. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
  167. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
  168. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
  169. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
  170. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
  171. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  172. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
  173. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
  174. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  175. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
  176. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
  177. diffusers/pipelines/cosmos/__init__.py +54 -0
  178. diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
  179. diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
  180. diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
  181. diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
  182. diffusers/pipelines/cosmos/pipeline_output.py +40 -0
  183. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
  184. diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
  185. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  186. diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
  187. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
  188. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
  189. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
  190. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
  191. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
  192. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
  193. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
  194. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  195. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
  196. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  197. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
  198. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
  199. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  200. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  201. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  202. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  203. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  204. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
  205. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
  206. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
  207. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
  208. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
  209. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
  210. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  211. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
  212. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
  213. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
  214. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
  215. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
  216. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  217. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  218. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
  219. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
  220. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
  221. diffusers/pipelines/flux/modeling_flux.py +1 -1
  222. diffusers/pipelines/flux/pipeline_flux.py +10 -17
  223. diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
  224. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
  225. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
  226. diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
  227. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
  228. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
  229. diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
  230. diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
  231. diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
  232. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
  233. diffusers/pipelines/free_init_utils.py +2 -2
  234. diffusers/pipelines/free_noise_utils.py +3 -3
  235. diffusers/pipelines/hidream_image/__init__.py +47 -0
  236. diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
  237. diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
  238. diffusers/pipelines/hunyuan_video/__init__.py +2 -0
  239. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
  240. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
  241. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
  242. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
  243. diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
  244. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
  245. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
  246. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
  247. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
  248. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
  249. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
  250. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
  251. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  252. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
  253. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
  254. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
  255. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
  256. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
  257. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
  258. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
  259. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
  260. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
  261. diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
  262. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
  263. diffusers/pipelines/kolors/text_encoder.py +3 -3
  264. diffusers/pipelines/kolors/tokenizer.py +1 -1
  265. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
  266. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
  267. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  268. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
  269. diffusers/pipelines/latte/pipeline_latte.py +12 -12
  270. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
  271. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
  272. diffusers/pipelines/ltx/__init__.py +4 -0
  273. diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
  274. diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
  275. diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
  276. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
  277. diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
  278. diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
  279. diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
  280. diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
  281. diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
  282. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
  283. diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
  284. diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
  285. diffusers/pipelines/onnx_utils.py +15 -2
  286. diffusers/pipelines/pag/pag_utils.py +2 -2
  287. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
  288. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
  289. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
  290. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
  291. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
  292. diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
  293. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
  294. diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
  295. diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
  296. diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
  297. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
  298. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
  299. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
  300. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
  301. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
  302. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
  303. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
  304. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  305. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
  306. diffusers/pipelines/pia/pipeline_pia.py +8 -6
  307. diffusers/pipelines/pipeline_flax_utils.py +3 -4
  308. diffusers/pipelines/pipeline_loading_utils.py +89 -13
  309. diffusers/pipelines/pipeline_utils.py +105 -33
  310. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
  311. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
  312. diffusers/pipelines/sana/__init__.py +4 -0
  313. diffusers/pipelines/sana/pipeline_sana.py +23 -21
  314. diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
  315. diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
  316. diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
  317. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
  318. diffusers/pipelines/shap_e/camera.py +1 -1
  319. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  320. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  321. diffusers/pipelines/shap_e/renderer.py +3 -3
  322. diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
  323. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
  324. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
  325. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
  326. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
  327. diffusers/pipelines/stable_diffusion/__init__.py +0 -7
  328. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  329. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
  330. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  331. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
  332. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  333. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
  334. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
  335. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
  336. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
  337. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
  338. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
  339. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
  340. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
  341. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
  342. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
  343. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
  344. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
  345. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
  346. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
  347. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  348. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  349. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  350. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
  351. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
  352. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
  353. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
  354. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
  355. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
  356. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
  357. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
  358. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
  359. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
  360. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
  361. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
  362. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  363. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
  364. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  365. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
  366. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
  367. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
  368. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
  369. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
  370. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
  371. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
  372. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
  373. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
  374. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
  375. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
  376. diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
  377. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
  378. diffusers/pipelines/unclip/text_proj.py +2 -2
  379. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
  380. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  381. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
  382. diffusers/pipelines/visualcloze/__init__.py +52 -0
  383. diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
  384. diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
  385. diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
  386. diffusers/pipelines/wan/__init__.py +2 -0
  387. diffusers/pipelines/wan/pipeline_wan.py +13 -10
  388. diffusers/pipelines/wan/pipeline_wan_i2v.py +38 -18
  389. diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
  390. diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
  391. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  392. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
  393. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  394. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  395. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
  396. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
  397. diffusers/quantizers/__init__.py +179 -1
  398. diffusers/quantizers/base.py +6 -1
  399. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
  400. diffusers/quantizers/bitsandbytes/utils.py +10 -7
  401. diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
  402. diffusers/quantizers/gguf/utils.py +16 -13
  403. diffusers/quantizers/quantization_config.py +18 -16
  404. diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
  405. diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
  406. diffusers/schedulers/__init__.py +3 -1
  407. diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
  408. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  409. diffusers/schedulers/scheduling_consistency_models.py +1 -1
  410. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
  411. diffusers/schedulers/scheduling_ddim.py +8 -8
  412. diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
  413. diffusers/schedulers/scheduling_ddim_flax.py +6 -6
  414. diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
  415. diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
  416. diffusers/schedulers/scheduling_ddpm.py +9 -9
  417. diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
  418. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
  419. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
  420. diffusers/schedulers/scheduling_deis_multistep.py +8 -8
  421. diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
  422. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
  423. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
  424. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
  425. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  426. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
  427. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
  428. diffusers/schedulers/scheduling_edm_euler.py +20 -11
  429. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
  430. diffusers/schedulers/scheduling_euler_discrete.py +3 -3
  431. diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
  432. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
  433. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
  434. diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
  435. diffusers/schedulers/scheduling_heun_discrete.py +2 -2
  436. diffusers/schedulers/scheduling_ipndm.py +2 -2
  437. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
  438. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
  439. diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
  440. diffusers/schedulers/scheduling_lcm.py +3 -3
  441. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  442. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  443. diffusers/schedulers/scheduling_pndm.py +4 -4
  444. diffusers/schedulers/scheduling_pndm_flax.py +4 -4
  445. diffusers/schedulers/scheduling_repaint.py +9 -9
  446. diffusers/schedulers/scheduling_sasolver.py +15 -15
  447. diffusers/schedulers/scheduling_scm.py +1 -1
  448. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  449. diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
  450. diffusers/schedulers/scheduling_tcd.py +3 -3
  451. diffusers/schedulers/scheduling_unclip.py +5 -5
  452. diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
  453. diffusers/schedulers/scheduling_utils.py +1 -1
  454. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  455. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  456. diffusers/training_utils.py +13 -5
  457. diffusers/utils/__init__.py +5 -0
  458. diffusers/utils/accelerate_utils.py +1 -1
  459. diffusers/utils/doc_utils.py +1 -1
  460. diffusers/utils/dummy_pt_objects.py +120 -0
  461. diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
  462. diffusers/utils/dynamic_modules_utils.py +21 -3
  463. diffusers/utils/export_utils.py +1 -1
  464. diffusers/utils/import_utils.py +81 -18
  465. diffusers/utils/logging.py +1 -1
  466. diffusers/utils/outputs.py +2 -1
  467. diffusers/utils/peft_utils.py +91 -8
  468. diffusers/utils/state_dict_utils.py +20 -3
  469. diffusers/utils/testing_utils.py +59 -7
  470. diffusers/utils/torch_utils.py +25 -5
  471. diffusers/video_processor.py +2 -2
  472. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/METADATA +70 -55
  473. diffusers-0.34.0.dist-info/RECORD +639 -0
  474. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/WHEEL +1 -1
  475. diffusers-0.33.1.dist-info/RECORD +0 -608
  476. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
  477. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
  478. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,251 @@
1
+ # Copyright 2025 VisualCloze team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ from PIL import Image
19
+
20
+ from ...image_processor import VaeImageProcessor
21
+
22
+
23
class VisualClozeProcessor(VaeImageProcessor):
    """
    Image processor for the VisualCloze pipeline.

    This processor handles the preprocessing of images for visual cloze tasks, including resizing, normalization, and
    mask generation.

    Args:
        resolution (int, optional):
            Target resolution for processing images. Each image will be resized to this resolution before being
            concatenated to avoid the out-of-memory error. Defaults to 384.
        *args: Additional arguments passed to [~image_processor.VaeImageProcessor]
        **kwargs: Additional keyword arguments passed to [~image_processor.VaeImageProcessor]
    """

    def __init__(self, *args, resolution: int = 384, **kwargs):
        super().__init__(*args, **kwargs)
        self.resolution = resolution

    def preprocess_image(
        self, input_images: List[List[Optional[Image.Image]]], vae_scale_factor: int
    ) -> Tuple[List[List[torch.Tensor]], List[List[List[int]]], List[int]]:
        """
        Preprocesses input images for the VisualCloze pipeline.

        This function handles the preprocessing of input images by:
        1. Resizing and cropping images to maintain consistent dimensions
        2. Converting images to the Tensor format for the VAE
        3. Normalizing pixel values
        4. Tracking image sizes and positions of target images

        Args:
            input_images (List[List[Optional[Image.Image]]]):
                A nested list of PIL Images where:
                - Outer list represents different samples, including in-context examples and the query
                - Inner list contains images for the task
                - In the last row, condition images are provided and the target images are placed as None
            vae_scale_factor (int):
                The scale factor used by the VAE for resizing images

        Returns:
            Tuple containing:
            - List[List[torch.Tensor]]: Preprocessed images in tensor format
            - List[List[List[int]]]: Dimensions of each processed image [height, width]
            - List[int]: Target positions indicating which images are to be generated
        """
        n_samples, n_task_images = len(input_images), len(input_images[0])
        # Both sides of every resized image are rounded down to a multiple of this value.
        divisible = 2 * vae_scale_factor

        processed_images: List[List[Image.Image]] = [[] for _ in range(n_samples)]
        resize_size: List[Optional[Tuple[int, int]]] = [None for _ in range(n_samples)]
        target_position: List[int] = []

        # Process each sample
        for i in range(n_samples):
            # Determine size from first non-None image: keep its aspect ratio while
            # targeting an area of roughly `resolution * resolution` pixels.
            for j in range(n_task_images):
                if input_images[i][j] is not None:
                    aspect_ratio = input_images[i][j].width / input_images[i][j].height
                    target_area = self.resolution * self.resolution
                    new_h = int((target_area / aspect_ratio) ** 0.5)
                    new_w = int(new_h * aspect_ratio)

                    new_w = max(new_w // divisible, 1) * divisible
                    new_h = max(new_h // divisible, 1) * divisible
                    resize_size[i] = (new_w, new_h)
                    break

            # Process all images in the sample
            for j in range(n_task_images):
                if input_images[i][j] is not None:
                    target = self._resize_and_crop(input_images[i][j], resize_size[i][0], resize_size[i][1])
                    processed_images[i].append(target)
                    if i == n_samples - 1:
                        target_position.append(0)
                else:
                    # Missing images are replaced by a black placeholder; a row without any
                    # image falls back to a square of side `self.resolution`.
                    blank = Image.new("RGB", resize_size[i] or (self.resolution, self.resolution), (0, 0, 0))
                    processed_images[i].append(blank)
                    if i == n_samples - 1:
                        target_position.append(1)

        # Ensure consistent width for multiple target images when there are multiple target images
        if len(target_position) > 1 and sum(target_position) > 1:
            # The query row may contain no condition image at all, in which case its entry in
            # `resize_size` is None; use the same fallback side length as the blank placeholders.
            query_size = resize_size[n_samples - 1]
            new_w = (query_size[0] if query_size is not None else None) or self.resolution
            for row in processed_images:
                for j, image in enumerate(row):
                    new_h = int(image.height * (new_w / image.width))
                    new_w = int(new_w / 16) * 16
                    new_h = int(new_h / 16) * 16
                    # `resize` takes (image, height, width), as in `preprocess_image_upsampling`.
                    row[j] = self.resize(image, new_h, new_w)

        # Convert to tensors and normalize
        image_sizes = []
        for row in processed_images:
            # Sizes are recorded from the PIL images before they are replaced by tensors.
            image_sizes.append([[img.height, img.width] for img in row])
            for j, image in enumerate(row):
                image = self.pil_to_numpy(image)
                image = self.numpy_to_pt(image)
                image = self.normalize(image)
                row[j] = image

        return processed_images, image_sizes, target_position

    def preprocess_mask(
        self, input_images: List[List[Image.Image]], target_position: List[int]
    ) -> List[List[torch.Tensor]]:
        """
        Generate masks for the VisualCloze pipeline.

        Every mask in a row takes its spatial size from dimensions 2 and 3 of the first entry of that row.

        Args:
            input_images (List[List[Image.Image]]):
                Processed images from preprocess_image
            target_position (List[int]):
                Binary list marking the positions of target images (1 for target, 0 for condition)

        Returns:
            List[List[torch.Tensor]]:
                A nested list of mask tensors (1 for target positions, 0 for condition images)
        """
        mask = []
        for i, row in enumerate(input_images):
            if i == len(input_images) - 1:  # Query row
                row_masks = [
                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=m) for m in target_position
                ]
            else:  # In-context examples
                row_masks = [
                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=0) for _ in target_position
                ]
            mask.append(row_masks)
        return mask

    def preprocess_image_upsampling(
        self,
        input_images: List[List[Image.Image]],
        height: int,
        width: int,
    ) -> Tuple[List[List[Image.Image]], List[List[List[int]]]]:
        """Process images for the upsampling stage in the VisualCloze pipeline.

        Only `input_images[0][0]` is processed, and it is overwritten in place with the result, so the caller's
        nested list is modified.

        Args:
            input_images: Input image to process
            height: Target height
            width: Target width

        Returns:
            Tuple of processed image and its size
        """
        image = self.resize(input_images[0][0], height, width)
        image = self.pil_to_numpy(image)  # to np
        image = self.numpy_to_pt(image)  # to pt
        image = self.normalize(image)

        input_images[0][0] = image
        image_sizes = [[[height, width]]]
        return input_images, image_sizes

    def preprocess_mask_upsampling(self, input_images: List[List[Image.Image]]) -> List[List[torch.Tensor]]:
        """Return a single all-ones mask sized from dimensions 2 and 3 of `input_images[0][0]`."""
        return [[torch.ones((1, 1, input_images[0][0].shape[2], input_images[0][0].shape[3]))]]

    def get_layout_prompt(self, size: Tuple[int, int]) -> str:
        """Build the sentence describing the image grid.

        Args:
            size: `(rows, columns)` of the grid.

        Returns:
            The layout description as a plain string.
        """
        # No trailing comma after the f-string: it would turn the result into a 1-tuple
        # instead of the `str` promised by the signature.
        return (
            f"A grid layout with {size[0]} rows and {size[1]} columns, "
            f"displaying {size[0] * size[1]} images arranged side by side."
        )

    def preprocess(
        self,
        task_prompt: Union[str, List[str]],
        content_prompt: Union[str, List[str]],
        input_images: Optional[List[List[List[Optional[str]]]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        upsampling: bool = False,
        vae_scale_factor: int = 16,
    ) -> Dict:
        """Process visual cloze inputs.

        Args:
            task_prompt: Task description(s)
            content_prompt: Content description(s)
            input_images: List of images or None for the target images
            height: Optional target height for upsampling stage
            width: Optional target width for upsampling stage
            upsampling: Whether this is in the upsampling processing stage
            vae_scale_factor: Forwarded to `preprocess_image` when not upsampling

        Returns:
            Dictionary containing processed images, masks, prompts and metadata. In the upsampling stage
            `task_prompt` and `layout_prompt` hold `None` entries and `target_position` stays empty.
        """
        # A single string prompt means a single sample: wrap everything into a batch of one.
        if isinstance(task_prompt, str):
            task_prompt = [task_prompt]
            content_prompt = [content_prompt]
            input_images = [input_images]

        output = {
            "init_image": [],
            "mask": [],
            "task_prompt": task_prompt if not upsampling else [None for _ in range(len(task_prompt))],
            "content_prompt": content_prompt,
            "layout_prompt": [],
            "target_position": [],
            "image_size": [],
        }
        for i in range(len(task_prompt)):
            if upsampling:
                layout_prompt = None
                cur_processed_images, cur_image_size = self.preprocess_image_upsampling(
                    input_images[i], height=height, width=width
                )
                cur_mask = self.preprocess_mask_upsampling(cur_processed_images)
            else:
                layout_prompt = self.get_layout_prompt((len(input_images[i]), len(input_images[i][0])))
                cur_processed_images, cur_image_size, cur_target_position = self.preprocess_image(
                    input_images[i], vae_scale_factor=vae_scale_factor
                )
                cur_mask = self.preprocess_mask(cur_processed_images, cur_target_position)

                # Target positions only exist in this branch; appending here keeps the
                # upsampling stage from touching an unbound variable.
                output["target_position"].append(cur_target_position)

            output["image_size"].append(cur_image_size)
            output["init_image"].append(cur_processed_images)
            output["mask"].append(cur_mask)
            output["layout_prompt"].append(layout_prompt)

        return output
@@ -24,6 +24,7 @@ except OptionalDependencyNotAvailable:
24
24
  else:
25
25
  _import_structure["pipeline_wan"] = ["WanPipeline"]
26
26
  _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
27
+ _import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
27
28
  _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
28
29
  if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
29
30
  try:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
35
36
  else:
36
37
  from .pipeline_wan import WanPipeline
37
38
  from .pipeline_wan_i2v import WanImageToVideoPipeline
39
+ from .pipeline_wan_vace import WanVACEPipeline
38
40
  from .pipeline_wan_video2video import WanVideoToVideoPipeline
39
41
 
40
42
  else:
@@ -388,8 +388,10 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
388
388
 
389
389
  Args:
390
390
  prompt (`str` or `List[str]`, *optional*):
391
- The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
392
- instead.
391
+ The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
392
+ negative_prompt (`str` or `List[str]`, *optional*):
393
+ The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
394
+ instead. Ignored when not using guidance (`guidance_scale` < `1`).
393
395
  height (`int`, defaults to `480`):
394
396
  The height in pixels of the generated image.
395
397
  width (`int`, defaults to `832`):
@@ -400,11 +402,11 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
400
402
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
401
403
  expense of slower inference.
402
404
  guidance_scale (`float`, defaults to `5.0`):
403
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
404
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
405
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
406
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
407
- usually at the expense of lower image quality.
405
+ Guidance scale as defined in [Classifier-Free Diffusion
406
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
407
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
408
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
409
+ the text `prompt`, usually at the expense of lower image quality.
408
410
  num_videos_per_prompt (`int`, *optional*, defaults to 1):
409
411
  The number of images to generate per prompt.
410
412
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -417,7 +419,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
417
419
  prompt_embeds (`torch.Tensor`, *optional*):
418
420
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
419
421
  provided, text embeddings are generated from the `prompt` input argument.
420
- output_type (`str`, *optional*, defaults to `"pil"`):
422
+ output_type (`str`, *optional*, defaults to `"np"`):
421
423
  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
422
424
  return_dict (`bool`, *optional*, defaults to `True`):
423
425
  Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -434,8 +436,9 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
434
436
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
435
437
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
436
438
  `._callback_tensor_inputs` attribute of your pipeline class.
437
- autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
438
- The dtype to use for the torch.amp.autocast.
439
+ max_sequence_length (`int`, defaults to `512`):
440
+ The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
441
+ truncated. If the prompt is shorter, it will be padded to this length.
439
442
 
440
443
  Examples:
441
444
 
@@ -380,6 +380,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
380
380
  device: Optional[torch.device] = None,
381
381
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
382
382
  latents: Optional[torch.Tensor] = None,
383
+ last_image: Optional[torch.Tensor] = None,
383
384
  ) -> Tuple[torch.Tensor, torch.Tensor]:
384
385
  num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
385
386
  latent_height = height // self.vae_scale_factor_spatial
@@ -398,10 +399,17 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
398
399
  latents = latents.to(device=device, dtype=dtype)
399
400
 
400
401
  image = image.unsqueeze(2)
401
- video_condition = torch.cat(
402
- [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
403
- )
404
- video_condition = video_condition.to(device=device, dtype=dtype)
402
+ if last_image is None:
403
+ video_condition = torch.cat(
404
+ [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
405
+ )
406
+ else:
407
+ last_image = last_image.unsqueeze(2)
408
+ video_condition = torch.cat(
409
+ [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
410
+ dim=2,
411
+ )
412
+ video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
405
413
 
406
414
  latents_mean = (
407
415
  torch.tensor(self.vae.config.latents_mean)
@@ -421,10 +429,15 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
421
429
  latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
422
430
  latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)
423
431
 
432
+ latent_condition = latent_condition.to(dtype)
424
433
  latent_condition = (latent_condition - latents_mean) * latents_std
425
434
 
426
435
  mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
427
- mask_lat_size[:, :, list(range(1, num_frames))] = 0
436
+
437
+ if last_image is None:
438
+ mask_lat_size[:, :, list(range(1, num_frames))] = 0
439
+ else:
440
+ mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0
428
441
  first_frame_mask = mask_lat_size[:, :, 0:1]
429
442
  first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
430
443
  mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
@@ -476,6 +489,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
476
489
  prompt_embeds: Optional[torch.Tensor] = None,
477
490
  negative_prompt_embeds: Optional[torch.Tensor] = None,
478
491
  image_embeds: Optional[torch.Tensor] = None,
492
+ last_image: Optional[torch.Tensor] = None,
479
493
  output_type: Optional[str] = "np",
480
494
  return_dict: bool = True,
481
495
  attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -508,11 +522,11 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
508
522
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
509
523
  expense of slower inference.
510
524
  guidance_scale (`float`, defaults to `5.0`):
511
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
512
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
513
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
514
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
515
- usually at the expense of lower image quality.
525
+ Guidance scale as defined in [Classifier-Free Diffusion
526
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
527
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
528
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
529
+ the text `prompt`, usually at the expense of lower image quality.
516
530
  num_videos_per_prompt (`int`, *optional*, defaults to 1):
517
531
  The number of images to generate per prompt.
518
532
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -531,7 +545,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
531
545
  image_embeds (`torch.Tensor`, *optional*):
532
546
  Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
533
547
  image embeddings are generated from the `image` input argument.
534
- output_type (`str`, *optional*, defaults to `"pil"`):
548
+ output_type (`str`, *optional*, defaults to `"np"`):
535
549
  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
536
550
  return_dict (`bool`, *optional*, defaults to `True`):
537
551
  Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -548,12 +562,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
548
562
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
549
563
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
550
564
  `._callback_tensor_inputs` attribute of your pipeline class.
551
- max_sequence_length (`int`, *optional*, defaults to `512`):
552
- The maximum sequence length of the prompt.
553
- shift (`float`, *optional*, defaults to `5.0`):
554
- The shift of the flow.
555
- autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
556
- The dtype to use for the torch.amp.autocast.
565
+ max_sequence_length (`int`, defaults to `512`):
566
+ The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
567
+ truncated. If the prompt is shorter, it will be padded to this length.
568
+
557
569
  Examples:
558
570
 
559
571
  Returns:
@@ -620,7 +632,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
620
632
  negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
621
633
 
622
634
  if image_embeds is None:
623
- image_embeds = self.encode_image(image, device)
635
+ if last_image is None:
636
+ image_embeds = self.encode_image(image, device)
637
+ else:
638
+ image_embeds = self.encode_image([image, last_image], device)
624
639
  image_embeds = image_embeds.repeat(batch_size, 1, 1)
625
640
  image_embeds = image_embeds.to(transformer_dtype)
626
641
 
@@ -631,6 +646,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
631
646
  # 5. Prepare latent variables
632
647
  num_channels_latents = self.vae.config.z_dim
633
648
  image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
649
+ if last_image is not None:
650
+ last_image = self.video_processor.preprocess(last_image, height=height, width=width).to(
651
+ device, dtype=torch.float32
652
+ )
634
653
  latents, condition = self.prepare_latents(
635
654
  image,
636
655
  batch_size * num_videos_per_prompt,
@@ -642,6 +661,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
642
661
  device,
643
662
  generator,
644
663
  latents,
664
+ last_image,
645
665
  )
646
666
 
647
667
  # 6. Denoising loop