diffusers 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (478)
  1. diffusers/__init__.py +48 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/dependency_versions_check.py +1 -1
  7. diffusers/dependency_versions_table.py +1 -1
  8. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  9. diffusers/hooks/faster_cache.py +2 -2
  10. diffusers/hooks/group_offloading.py +128 -29
  11. diffusers/hooks/hooks.py +2 -2
  12. diffusers/hooks/layerwise_casting.py +3 -3
  13. diffusers/hooks/pyramid_attention_broadcast.py +1 -1
  14. diffusers/image_processor.py +7 -2
  15. diffusers/loaders/__init__.py +4 -0
  16. diffusers/loaders/ip_adapter.py +5 -14
  17. diffusers/loaders/lora_base.py +212 -111
  18. diffusers/loaders/lora_conversion_utils.py +275 -34
  19. diffusers/loaders/lora_pipeline.py +1554 -819
  20. diffusers/loaders/peft.py +52 -109
  21. diffusers/loaders/single_file.py +2 -2
  22. diffusers/loaders/single_file_model.py +20 -4
  23. diffusers/loaders/single_file_utils.py +225 -5
  24. diffusers/loaders/textual_inversion.py +3 -2
  25. diffusers/loaders/transformer_flux.py +1 -1
  26. diffusers/loaders/transformer_sd3.py +2 -2
  27. diffusers/loaders/unet.py +2 -16
  28. diffusers/loaders/unet_loader_utils.py +1 -1
  29. diffusers/loaders/utils.py +1 -1
  30. diffusers/models/__init__.py +15 -1
  31. diffusers/models/activations.py +5 -5
  32. diffusers/models/adapter.py +2 -3
  33. diffusers/models/attention.py +4 -4
  34. diffusers/models/attention_flax.py +10 -10
  35. diffusers/models/attention_processor.py +14 -10
  36. diffusers/models/auto_model.py +47 -10
  37. diffusers/models/autoencoders/__init__.py +1 -0
  38. diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
  39. diffusers/models/autoencoders/autoencoder_dc.py +3 -3
  40. diffusers/models/autoencoders/autoencoder_kl.py +4 -4
  41. diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
  42. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
  43. diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
  44. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
  45. diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
  46. diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
  47. diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
  48. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
  49. diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
  50. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
  51. diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
  52. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  53. diffusers/models/autoencoders/vae.py +13 -2
  54. diffusers/models/autoencoders/vq_model.py +2 -2
  55. diffusers/models/cache_utils.py +1 -1
  56. diffusers/models/controlnet.py +1 -1
  57. diffusers/models/controlnet_flux.py +1 -1
  58. diffusers/models/controlnet_sd3.py +1 -1
  59. diffusers/models/controlnet_sparsectrl.py +1 -1
  60. diffusers/models/controlnets/__init__.py +1 -0
  61. diffusers/models/controlnets/controlnet.py +3 -3
  62. diffusers/models/controlnets/controlnet_flax.py +1 -1
  63. diffusers/models/controlnets/controlnet_flux.py +16 -15
  64. diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
  65. diffusers/models/controlnets/controlnet_sana.py +290 -0
  66. diffusers/models/controlnets/controlnet_sd3.py +1 -1
  67. diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
  68. diffusers/models/controlnets/controlnet_union.py +1 -1
  69. diffusers/models/controlnets/controlnet_xs.py +7 -7
  70. diffusers/models/controlnets/multicontrolnet.py +4 -5
  71. diffusers/models/controlnets/multicontrolnet_union.py +5 -6
  72. diffusers/models/downsampling.py +2 -2
  73. diffusers/models/embeddings.py +10 -12
  74. diffusers/models/embeddings_flax.py +2 -2
  75. diffusers/models/lora.py +3 -3
  76. diffusers/models/modeling_utils.py +44 -14
  77. diffusers/models/normalization.py +4 -4
  78. diffusers/models/resnet.py +2 -2
  79. diffusers/models/resnet_flax.py +1 -1
  80. diffusers/models/transformers/__init__.py +5 -0
  81. diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
  82. diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
  83. diffusers/models/transformers/consisid_transformer_3d.py +1 -1
  84. diffusers/models/transformers/dit_transformer_2d.py +2 -2
  85. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  86. diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
  87. diffusers/models/transformers/latte_transformer_3d.py +4 -5
  88. diffusers/models/transformers/lumina_nextdit2d.py +2 -2
  89. diffusers/models/transformers/pixart_transformer_2d.py +3 -3
  90. diffusers/models/transformers/prior_transformer.py +1 -1
  91. diffusers/models/transformers/sana_transformer.py +8 -3
  92. diffusers/models/transformers/stable_audio_transformer.py +5 -9
  93. diffusers/models/transformers/t5_film_transformer.py +3 -3
  94. diffusers/models/transformers/transformer_2d.py +1 -1
  95. diffusers/models/transformers/transformer_allegro.py +1 -1
  96. diffusers/models/transformers/transformer_chroma.py +742 -0
  97. diffusers/models/transformers/transformer_cogview3plus.py +5 -10
  98. diffusers/models/transformers/transformer_cogview4.py +317 -25
  99. diffusers/models/transformers/transformer_cosmos.py +579 -0
  100. diffusers/models/transformers/transformer_flux.py +9 -11
  101. diffusers/models/transformers/transformer_hidream_image.py +942 -0
  102. diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
  103. diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
  104. diffusers/models/transformers/transformer_ltx.py +2 -2
  105. diffusers/models/transformers/transformer_lumina2.py +1 -1
  106. diffusers/models/transformers/transformer_mochi.py +1 -1
  107. diffusers/models/transformers/transformer_omnigen.py +2 -2
  108. diffusers/models/transformers/transformer_sd3.py +7 -7
  109. diffusers/models/transformers/transformer_temporal.py +1 -1
  110. diffusers/models/transformers/transformer_wan.py +24 -8
  111. diffusers/models/transformers/transformer_wan_vace.py +393 -0
  112. diffusers/models/unets/unet_1d.py +1 -1
  113. diffusers/models/unets/unet_1d_blocks.py +1 -1
  114. diffusers/models/unets/unet_2d.py +1 -1
  115. diffusers/models/unets/unet_2d_blocks.py +1 -1
  116. diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
  117. diffusers/models/unets/unet_2d_condition.py +2 -2
  118. diffusers/models/unets/unet_2d_condition_flax.py +2 -2
  119. diffusers/models/unets/unet_3d_blocks.py +1 -1
  120. diffusers/models/unets/unet_3d_condition.py +3 -3
  121. diffusers/models/unets/unet_i2vgen_xl.py +3 -3
  122. diffusers/models/unets/unet_kandinsky3.py +1 -1
  123. diffusers/models/unets/unet_motion_model.py +2 -2
  124. diffusers/models/unets/unet_stable_cascade.py +1 -1
  125. diffusers/models/upsampling.py +2 -2
  126. diffusers/models/vae_flax.py +2 -2
  127. diffusers/models/vq_model.py +1 -1
  128. diffusers/pipelines/__init__.py +37 -6
  129. diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
  130. diffusers/pipelines/amused/pipeline_amused.py +7 -6
  131. diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
  132. diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
  133. diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
  134. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
  135. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
  136. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
  137. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
  138. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
  139. diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
  140. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  141. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
  142. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
  143. diffusers/pipelines/auto_pipeline.py +6 -7
  144. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  145. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  146. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
  147. diffusers/pipelines/chroma/__init__.py +49 -0
  148. diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
  149. diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
  150. diffusers/pipelines/chroma/pipeline_output.py +21 -0
  151. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
  152. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
  153. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
  154. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
  155. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
  156. diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
  157. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
  158. diffusers/pipelines/consisid/consisid_utils.py +2 -2
  159. diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
  160. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  161. diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
  162. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
  163. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
  164. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
  165. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
  166. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
  167. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
  168. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
  169. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
  170. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
  171. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  172. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
  173. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
  174. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  175. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
  176. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
  177. diffusers/pipelines/cosmos/__init__.py +54 -0
  178. diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
  179. diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
  180. diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
  181. diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
  182. diffusers/pipelines/cosmos/pipeline_output.py +40 -0
  183. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
  184. diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
  185. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  186. diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
  187. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
  188. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
  189. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
  190. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
  191. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
  192. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
  193. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
  194. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  195. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
  196. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  197. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
  198. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
  199. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  200. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  201. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  202. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  203. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  204. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
  205. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
  206. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
  207. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
  208. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
  209. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
  210. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  211. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
  212. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
  213. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
  214. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
  215. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
  216. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  217. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  218. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
  219. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
  220. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
  221. diffusers/pipelines/flux/modeling_flux.py +1 -1
  222. diffusers/pipelines/flux/pipeline_flux.py +10 -17
  223. diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
  224. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
  225. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
  226. diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
  227. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
  228. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
  229. diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
  230. diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
  231. diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
  232. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
  233. diffusers/pipelines/free_init_utils.py +2 -2
  234. diffusers/pipelines/free_noise_utils.py +3 -3
  235. diffusers/pipelines/hidream_image/__init__.py +47 -0
  236. diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
  237. diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
  238. diffusers/pipelines/hunyuan_video/__init__.py +2 -0
  239. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
  240. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
  241. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
  242. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
  243. diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
  244. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
  245. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
  246. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
  247. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
  248. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
  249. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
  250. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
  251. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  252. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
  253. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
  254. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
  255. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
  256. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
  257. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
  258. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
  259. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
  260. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
  261. diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
  262. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
  263. diffusers/pipelines/kolors/text_encoder.py +3 -3
  264. diffusers/pipelines/kolors/tokenizer.py +1 -1
  265. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
  266. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
  267. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  268. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
  269. diffusers/pipelines/latte/pipeline_latte.py +12 -12
  270. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
  271. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
  272. diffusers/pipelines/ltx/__init__.py +4 -0
  273. diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
  274. diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
  275. diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
  276. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
  277. diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
  278. diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
  279. diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
  280. diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
  281. diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
  282. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
  283. diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
  284. diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
  285. diffusers/pipelines/onnx_utils.py +15 -2
  286. diffusers/pipelines/pag/pag_utils.py +2 -2
  287. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
  288. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
  289. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
  290. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
  291. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
  292. diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
  293. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
  294. diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
  295. diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
  296. diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
  297. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
  298. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
  299. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
  300. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
  301. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
  302. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
  303. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
  304. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  305. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
  306. diffusers/pipelines/pia/pipeline_pia.py +8 -6
  307. diffusers/pipelines/pipeline_flax_utils.py +3 -4
  308. diffusers/pipelines/pipeline_loading_utils.py +89 -13
  309. diffusers/pipelines/pipeline_utils.py +105 -33
  310. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
  311. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
  312. diffusers/pipelines/sana/__init__.py +4 -0
  313. diffusers/pipelines/sana/pipeline_sana.py +23 -21
  314. diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
  315. diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
  316. diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
  317. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
  318. diffusers/pipelines/shap_e/camera.py +1 -1
  319. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  320. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  321. diffusers/pipelines/shap_e/renderer.py +3 -3
  322. diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
  323. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
  324. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
  325. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
  326. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
  327. diffusers/pipelines/stable_diffusion/__init__.py +0 -7
  328. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  329. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
  330. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  331. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
  332. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  333. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
  334. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
  335. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
  336. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
  337. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
  338. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
  339. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
  340. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
  341. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
  342. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
  343. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
  344. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
  345. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
  346. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
  347. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  348. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  349. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  350. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
  351. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
  352. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
  353. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
  354. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
  355. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
  356. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
  357. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
  358. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
  359. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
  360. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
  361. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
  362. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  363. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
  364. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  365. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
  366. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
  367. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
  368. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
  369. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
  370. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
  371. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
  372. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
  373. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
  374. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
  375. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
  376. diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
  377. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
  378. diffusers/pipelines/unclip/text_proj.py +2 -2
  379. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
  380. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  381. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
  382. diffusers/pipelines/visualcloze/__init__.py +52 -0
  383. diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
  384. diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
  385. diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
  386. diffusers/pipelines/wan/__init__.py +2 -0
  387. diffusers/pipelines/wan/pipeline_wan.py +17 -12
  388. diffusers/pipelines/wan/pipeline_wan_i2v.py +42 -20
  389. diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
  390. diffusers/pipelines/wan/pipeline_wan_video2video.py +18 -18
  391. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  392. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
  393. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  394. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  395. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
  396. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
  397. diffusers/quantizers/__init__.py +179 -1
  398. diffusers/quantizers/base.py +6 -1
  399. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
  400. diffusers/quantizers/bitsandbytes/utils.py +10 -7
  401. diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
  402. diffusers/quantizers/gguf/utils.py +16 -13
  403. diffusers/quantizers/quantization_config.py +18 -16
  404. diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
  405. diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
  406. diffusers/schedulers/__init__.py +3 -1
  407. diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
  408. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  409. diffusers/schedulers/scheduling_consistency_models.py +1 -1
  410. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
  411. diffusers/schedulers/scheduling_ddim.py +8 -8
  412. diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
  413. diffusers/schedulers/scheduling_ddim_flax.py +6 -6
  414. diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
  415. diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
  416. diffusers/schedulers/scheduling_ddpm.py +9 -9
  417. diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
  418. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
  419. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
  420. diffusers/schedulers/scheduling_deis_multistep.py +8 -8
  421. diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
  422. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
  423. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
  424. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
  425. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  426. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
  427. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
  428. diffusers/schedulers/scheduling_edm_euler.py +20 -11
  429. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
  430. diffusers/schedulers/scheduling_euler_discrete.py +3 -3
  431. diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
  432. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
  433. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
  434. diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
  435. diffusers/schedulers/scheduling_heun_discrete.py +2 -2
  436. diffusers/schedulers/scheduling_ipndm.py +2 -2
  437. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
  438. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
  439. diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
  440. diffusers/schedulers/scheduling_lcm.py +3 -3
  441. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  442. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  443. diffusers/schedulers/scheduling_pndm.py +4 -4
  444. diffusers/schedulers/scheduling_pndm_flax.py +4 -4
  445. diffusers/schedulers/scheduling_repaint.py +9 -9
  446. diffusers/schedulers/scheduling_sasolver.py +15 -15
  447. diffusers/schedulers/scheduling_scm.py +1 -1
  448. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  449. diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
  450. diffusers/schedulers/scheduling_tcd.py +3 -3
  451. diffusers/schedulers/scheduling_unclip.py +5 -5
  452. diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
  453. diffusers/schedulers/scheduling_utils.py +1 -1
  454. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  455. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  456. diffusers/training_utils.py +13 -5
  457. diffusers/utils/__init__.py +5 -0
  458. diffusers/utils/accelerate_utils.py +1 -1
  459. diffusers/utils/doc_utils.py +1 -1
  460. diffusers/utils/dummy_pt_objects.py +120 -0
  461. diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
  462. diffusers/utils/dynamic_modules_utils.py +21 -3
  463. diffusers/utils/export_utils.py +1 -1
  464. diffusers/utils/import_utils.py +81 -18
  465. diffusers/utils/logging.py +1 -1
  466. diffusers/utils/outputs.py +2 -1
  467. diffusers/utils/peft_utils.py +91 -8
  468. diffusers/utils/state_dict_utils.py +20 -3
  469. diffusers/utils/testing_utils.py +59 -7
  470. diffusers/utils/torch_utils.py +25 -5
  471. diffusers/video_processor.py +2 -2
  472. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/METADATA +3 -3
  473. diffusers-0.34.0.dist-info/RECORD +639 -0
  474. diffusers-0.33.0.dist-info/RECORD +0 -608
  475. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
  476. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/WHEEL +0 -0
  477. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
  478. {diffusers-0.33.0.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0

diffusers/pipelines/visualcloze/visualcloze_utils.py (new file)
@@ -0,0 +1,251 @@
+# Copyright 2025 VisualCloze team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from PIL import Image
+
+from ...image_processor import VaeImageProcessor
+
+
+class VisualClozeProcessor(VaeImageProcessor):
+    """
+    Image processor for the VisualCloze pipeline.
+
+    This processor handles the preprocessing of images for visual cloze tasks, including resizing, normalization, and
+    mask generation.
+
+    Args:
+        resolution (int, optional):
+            Target resolution for processing images. Each image will be resized to this resolution before being
+            concatenated to avoid the out-of-memory error. Defaults to 384.
+        *args: Additional arguments passed to [~image_processor.VaeImageProcessor]
+        **kwargs: Additional keyword arguments passed to [~image_processor.VaeImageProcessor]
+    """
+
+    def __init__(self, *args, resolution: int = 384, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.resolution = resolution
+
+    def preprocess_image(
+        self, input_images: List[List[Optional[Image.Image]]], vae_scale_factor: int
+    ) -> Tuple[List[List[torch.Tensor]], List[List[List[int]]], List[int]]:
+        """
+        Preprocesses input images for the VisualCloze pipeline.
+
+        This function handles the preprocessing of input images by:
+        1. Resizing and cropping images to maintain consistent dimensions
+        2. Converting images to the Tensor format for the VAE
+        3. Normalizing pixel values
+        4. Tracking image sizes and positions of target images
+
+        Args:
+            input_images (List[List[Optional[Image.Image]]]):
+                A nested list of PIL Images where:
+                - Outer list represents different samples, including in-context examples and the query
+                - Inner list contains images for the task
+                - In the last row, condition images are provided and the target images are placed as None
+            vae_scale_factor (int):
+                The scale factor used by the VAE for resizing images
+
+        Returns:
+            Tuple containing:
+            - List[List[torch.Tensor]]: Preprocessed images in tensor format
+            - List[List[List[int]]]: Dimensions of each processed image [height, width]
+            - List[int]: Target positions indicating which images are to be generated
+        """
+        n_samples, n_task_images = len(input_images), len(input_images[0])
+        divisible = 2 * vae_scale_factor
+
+        processed_images: List[List[Image.Image]] = [[] for _ in range(n_samples)]
+        resize_size: List[Optional[Tuple[int, int]]] = [None for _ in range(n_samples)]
+        target_position: List[int] = []
+
+        # Process each sample
+        for i in range(n_samples):
+            # Determine size from first non-None image
+            for j in range(n_task_images):
+                if input_images[i][j] is not None:
+                    aspect_ratio = input_images[i][j].width / input_images[i][j].height
+                    target_area = self.resolution * self.resolution
+                    new_h = int((target_area / aspect_ratio) ** 0.5)
+                    new_w = int(new_h * aspect_ratio)
+
+                    new_w = max(new_w // divisible, 1) * divisible
+                    new_h = max(new_h // divisible, 1) * divisible
+                    resize_size[i] = (new_w, new_h)
+                    break
+
+            # Process all images in the sample
+            for j in range(n_task_images):
+                if input_images[i][j] is not None:
+                    target = self._resize_and_crop(input_images[i][j], resize_size[i][0], resize_size[i][1])
+                    processed_images[i].append(target)
+                    if i == n_samples - 1:
+                        target_position.append(0)
+                else:
+                    blank = Image.new("RGB", resize_size[i] or (self.resolution, self.resolution), (0, 0, 0))
+                    processed_images[i].append(blank)
+                    if i == n_samples - 1:
+                        target_position.append(1)
+
+        # Ensure consistent width for multiple target images when there are multiple target images
+        if len(target_position) > 1 and sum(target_position) > 1:
+            new_w = resize_size[n_samples - 1][0] or 384
+            for i in range(len(processed_images)):
+                for j in range(len(processed_images[i])):
+                    if processed_images[i][j] is not None:
+                        new_h = int(processed_images[i][j].height * (new_w / processed_images[i][j].width))
+                        new_w = int(new_w / 16) * 16
+                        new_h = int(new_h / 16) * 16
+                        processed_images[i][j] = self.height(processed_images[i][j], new_h, new_w)
+
+        # Convert to tensors and normalize
+        image_sizes = []
+        for i in range(len(processed_images)):
+            image_sizes.append([[img.height, img.width] for img in processed_images[i]])
+            for j, image in enumerate(processed_images[i]):
+                image = self.pil_to_numpy(image)
+                image = self.numpy_to_pt(image)
+                image = self.normalize(image)
+                processed_images[i][j] = image
+
+        return processed_images, image_sizes, target_position
+
+    def preprocess_mask(
+        self, input_images: List[List[Image.Image]], target_position: List[int]
+    ) -> List[List[torch.Tensor]]:
+        """
+        Generate masks for the VisualCloze pipeline.
+
+        Args:
+            input_images (List[List[Image.Image]]):
+                Processed images from preprocess_image
+            target_position (List[int]):
+                Binary list marking the positions of target images (1 for target, 0 for condition)
+
+        Returns:
+            List[List[torch.Tensor]]:
+                A nested list of mask tensors (1 for target positions, 0 for condition images)
+        """
+        mask = []
+        for i, row in enumerate(input_images):
+            if i == len(input_images) - 1:  # Query row
+                row_masks = [
+                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=m) for m in target_position
+                ]
+            else:  # In-context examples
+                row_masks = [
+                    torch.full((1, 1, row[0].shape[2], row[0].shape[3]), fill_value=0) for _ in target_position
+                ]
+            mask.append(row_masks)
+        return mask
+
+    def preprocess_image_upsampling(
+        self,
+        input_images: List[List[Image.Image]],
+        height: int,
+        width: int,
+    ) -> Tuple[List[List[Image.Image]], List[List[List[int]]]]:
+        """Process images for the upsampling stage in the VisualCloze pipeline.
+
+        Args:
+            input_images: Input image to process
+            height: Target height
+            width: Target width
+
+        Returns:
+            Tuple of processed image and its size
+        """
+        image = self.resize(input_images[0][0], height, width)
+        image = self.pil_to_numpy(image)  # to np
+        image = self.numpy_to_pt(image)  # to pt
+        image = self.normalize(image)
+
+        input_images[0][0] = image
+        image_sizes = [[[height, width]]]
+        return input_images, image_sizes
+
+    def preprocess_mask_upsampling(self, input_images: List[List[Image.Image]]) -> List[List[torch.Tensor]]:
+        return [[torch.ones((1, 1, input_images[0][0].shape[2], input_images[0][0].shape[3]))]]
+
+    def get_layout_prompt(self, size: Tuple[int, int]) -> str:
+        layout_instruction = (
+            f"A grid layout with {size[0]} rows and {size[1]} columns, displaying {size[0] * size[1]} images arranged side by side.",
+        )
+        return layout_instruction
+
+    def preprocess(
+        self,
+        task_prompt: Union[str, List[str]],
+        content_prompt: Union[str, List[str]],
+        input_images: Optional[List[List[List[Optional[str]]]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        upsampling: bool = False,
+        vae_scale_factor: int = 16,
+    ) -> Dict:
+        """Process visual cloze inputs.
+
+        Args:
+            task_prompt: Task description(s)
+            content_prompt: Content description(s)
+            input_images: List of images or None for the target images
+            height: Optional target height for upsampling stage
+            width: Optional target width for upsampling stage
+            upsampling: Whether this is in the upsampling processing stage
+
+        Returns:
+            Dictionary containing processed images, masks, prompts and metadata
+        """
+        if isinstance(task_prompt, str):
+            task_prompt = [task_prompt]
+            content_prompt = [content_prompt]
+            input_images = [input_images]
+
+        output = {
+            "init_image": [],
+            "mask": [],
+            "task_prompt": task_prompt if not upsampling else [None for _ in range(len(task_prompt))],
+            "content_prompt": content_prompt,
+            "layout_prompt": [],
+            "target_position": [],
+            "image_size": [],
+        }
+        for i in range(len(task_prompt)):
+            if upsampling:
+                layout_prompt = None
+            else:
+                layout_prompt = self.get_layout_prompt((len(input_images[i]), len(input_images[i][0])))
+
+            if upsampling:
+                cur_processed_images, cur_image_size = self.preprocess_image_upsampling(
+                    input_images[i], height=height, width=width
+                )
+                cur_mask = self.preprocess_mask_upsampling(cur_processed_images)
+            else:
+                cur_processed_images, cur_image_size, cur_target_position = self.preprocess_image(
+                    input_images[i], vae_scale_factor=vae_scale_factor
+                )
+                cur_mask = self.preprocess_mask(cur_processed_images, cur_target_position)
+
+                output["target_position"].append(cur_target_position)
+
+            output["image_size"].append(cur_image_size)
+            output["init_image"].append(cur_processed_images)
+            output["mask"].append(cur_mask)
+            output["layout_prompt"].append(layout_prompt)
+
+        return output
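
The processor above feeds the new VisualCloze pipelines added in this release (entries 382-385). A minimal sketch of driving it directly; the file names and prompts are illustrative and not taken from the package:

    from PIL import Image

    from diffusers.pipelines.visualcloze.visualcloze_utils import VisualClozeProcessor

    processor = VisualClozeProcessor(resolution=384)

    # One in-context example row plus a query row; None marks the image to be generated.
    example_row = [Image.open("example_condition.png"), Image.open("example_target.png")]  # illustrative paths
    query_row = [Image.open("my_condition.png"), None]

    inputs = processor.preprocess(
        task_prompt="Generate the target image from the condition image, following the example.",
        content_prompt="A photo of a red bicycle leaning against a brick wall.",
        input_images=[example_row, query_row],
        vae_scale_factor=16,
    )
    # inputs["init_image"], inputs["mask"], inputs["target_position"] and the prompts are the
    # preprocessed pieces the VisualCloze generation pipeline consumes.
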
diffusers/pipelines/wan/__init__.py
@@ -24,6 +24,7 @@ except OptionalDependencyNotAvailable:
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
+    _import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
     _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     else:
         from .pipeline_wan import WanPipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
+        from .pipeline_wan_vace import WanVACEPipeline
         from .pipeline_wan_video2video import WanVideoToVideoPipeline
 
 else:
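
With the two registrations above, the new VACE pipeline is reachable through the same lazy-import machinery as the existing Wan pipelines. A minimal sketch; the top-level re-export is assumed to follow from the diffusers/__init__.py change (entry 1):

    # Resolved lazily through _import_structure unless DIFFUSERS_SLOW_IMPORT is set
    from diffusers.pipelines.wan import WanVACEPipeline

    # assuming the top-level export added in diffusers/__init__.py covers it as well
    from diffusers import WanVACEPipeline
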
diffusers/pipelines/wan/pipeline_wan.py
@@ -15,7 +15,6 @@
 import html
 from typing import Any, Callable, Dict, List, Optional, Union
 
-import ftfy
 import regex as re
 import torch
 from transformers import AutoTokenizer, UMT5EncoderModel
@@ -24,7 +23,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...loaders import WanLoraLoaderMixin
 from ...models import AutoencoderKLWan, WanTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -40,6 +39,9 @@ else:
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+if is_ftfy_available():
+    import ftfy
+
 
 EXAMPLE_DOC_STRING = """
 Examples:
@@ -386,8 +388,10 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
+                The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):
@@ -398,11 +402,11 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -415,7 +419,7 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
+            output_type (`str`, *optional*, defaults to `"np"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -432,8 +436,9 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
 
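
The docstring fixes above reflect how WanPipeline is actually called: negative_prompt is honored, output defaults to numpy frames, and prompts are truncated or padded to max_sequence_length. A minimal text-to-video sketch under those arguments; the checkpoint id is an assumption borrowed from the usual Wan 2.1 examples, not something stated in this diff:

    import torch
    from diffusers import AutoencoderKLWan, WanPipeline
    from diffusers.utils import export_to_video

    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"  # assumed checkpoint id
    vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
    pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")

    frames = pipe(
        prompt="A cat walking through tall grass, golden hour, cinematic",
        negative_prompt="blurry, low quality, static",  # ignored when guidance_scale < 1
        height=480,
        width=832,
        num_frames=81,
        guidance_scale=5.0,
        max_sequence_length=512,
    ).frames[0]
    export_to_video(frames, "wan_t2v.mp4", fps=16)
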
diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -15,7 +15,6 @@
 import html
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
-import ftfy
 import PIL
 import regex as re
 import torch
@@ -26,7 +25,7 @@ from ...image_processor import PipelineImageInput
 from ...loaders import WanLoraLoaderMixin
 from ...models import AutoencoderKLWan, WanTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -42,6 +41,9 @@ else:
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+if is_ftfy_available():
+    import ftfy
+
 EXAMPLE_DOC_STRING = """
 Examples:
     ```python
@@ -378,6 +380,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         device: Optional[torch.device] = None,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
+        last_image: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         latent_height = height // self.vae_scale_factor_spatial
@@ -396,10 +399,17 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             latents = latents.to(device=device, dtype=dtype)
 
         image = image.unsqueeze(2)
-        video_condition = torch.cat(
-            [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
-        )
-        video_condition = video_condition.to(device=device, dtype=dtype)
+        if last_image is None:
+            video_condition = torch.cat(
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
+            )
+        else:
+            last_image = last_image.unsqueeze(2)
+            video_condition = torch.cat(
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
+                dim=2,
+            )
+        video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
 
         latents_mean = (
             torch.tensor(self.vae.config.latents_mean)
@@ -419,10 +429,15 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
         latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)
 
+        latent_condition = latent_condition.to(dtype)
         latent_condition = (latent_condition - latents_mean) * latents_std
 
         mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
-        mask_lat_size[:, :, list(range(1, num_frames))] = 0
+
+        if last_image is None:
+            mask_lat_size[:, :, list(range(1, num_frames))] = 0
+        else:
+            mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0
         first_frame_mask = mask_lat_size[:, :, 0:1]
         first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
         mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
@@ -474,6 +489,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         prompt_embeds: Optional[torch.Tensor] = None,
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         image_embeds: Optional[torch.Tensor] = None,
+        last_image: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "np",
         return_dict: bool = True,
         attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -506,11 +522,11 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -529,7 +545,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             image_embeds (`torch.Tensor`, *optional*):
                 Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
                 image embeddings are generated from the `image` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
+            output_type (`str`, *optional*, defaults to `"np"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
@@ -546,12 +562,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            max_sequence_length (`int`, *optional*, defaults to `512`):
-                The maximum sequence length of the prompt.
-            shift (`float`, *optional*, defaults to `5.0`):
-                The shift of the flow.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
+
         Examples:
 
         Returns:
@@ -618,7 +632,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
 
         if image_embeds is None:
-            image_embeds = self.encode_image(image, device)
+            if last_image is None:
+                image_embeds = self.encode_image(image, device)
+            else:
+                image_embeds = self.encode_image([image, last_image], device)
         image_embeds = image_embeds.repeat(batch_size, 1, 1)
         image_embeds = image_embeds.to(transformer_dtype)
 
@@ -629,6 +646,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         # 5. Prepare latent variables
         num_channels_latents = self.vae.config.z_dim
         image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
+        if last_image is not None:
+            last_image = self.video_processor.preprocess(last_image, height=height, width=width).to(
+                device, dtype=torch.float32
+            )
         latents, condition = self.prepare_latents(
            image,
            batch_size * num_videos_per_prompt,
@@ -640,6 +661,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
             device,
             generator,
             latents,
+            last_image,
         )
 
         # 6. Denoising loop
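
Taken together, the last_image changes let WanImageToVideoPipeline condition on both the first and the last frame: the extra frame is appended to video_condition, the latent mask keeps both endpoints, and the image encoder sees both images. A minimal sketch of calling it with the new argument; the checkpoint placeholder and file names are illustrative, since this diff does not name a specific first/last-frame checkpoint:

    import torch
    from diffusers import WanImageToVideoPipeline
    from diffusers.utils import export_to_video, load_image

    # Placeholder: substitute a Wan 2.1 first/last-frame-capable Diffusers checkpoint.
    pipe = WanImageToVideoPipeline.from_pretrained("<wan-i2v-checkpoint>", torch_dtype=torch.bfloat16).to("cuda")

    first_frame = load_image("first_frame.png")  # illustrative local files
    last_frame = load_image("last_frame.png")

    frames = pipe(
        image=first_frame,
        last_image=last_frame,  # new in 0.34.0: also conditions the final frame
        prompt="Clouds rolling over a mountain ridge, timelapse",
        height=480,
        width=832,
        num_frames=81,
        guidance_scale=5.0,
    ).frames[0]
    export_to_video(frames, "first_to_last.mp4", fps=16)
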