diffusers 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (478)
  1. diffusers/__init__.py +48 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/dependency_versions_check.py +1 -1
  7. diffusers/dependency_versions_table.py +1 -1
  8. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  9. diffusers/hooks/faster_cache.py +2 -2
  10. diffusers/hooks/group_offloading.py +128 -29
  11. diffusers/hooks/hooks.py +2 -2
  12. diffusers/hooks/layerwise_casting.py +3 -3
  13. diffusers/hooks/pyramid_attention_broadcast.py +1 -1
  14. diffusers/image_processor.py +7 -2
  15. diffusers/loaders/__init__.py +4 -0
  16. diffusers/loaders/ip_adapter.py +5 -14
  17. diffusers/loaders/lora_base.py +212 -111
  18. diffusers/loaders/lora_conversion_utils.py +275 -34
  19. diffusers/loaders/lora_pipeline.py +1554 -819
  20. diffusers/loaders/peft.py +52 -109
  21. diffusers/loaders/single_file.py +2 -2
  22. diffusers/loaders/single_file_model.py +20 -4
  23. diffusers/loaders/single_file_utils.py +225 -5
  24. diffusers/loaders/textual_inversion.py +3 -2
  25. diffusers/loaders/transformer_flux.py +1 -1
  26. diffusers/loaders/transformer_sd3.py +2 -2
  27. diffusers/loaders/unet.py +2 -16
  28. diffusers/loaders/unet_loader_utils.py +1 -1
  29. diffusers/loaders/utils.py +1 -1
  30. diffusers/models/__init__.py +15 -1
  31. diffusers/models/activations.py +5 -5
  32. diffusers/models/adapter.py +2 -3
  33. diffusers/models/attention.py +4 -4
  34. diffusers/models/attention_flax.py +10 -10
  35. diffusers/models/attention_processor.py +14 -10
  36. diffusers/models/auto_model.py +47 -10
  37. diffusers/models/autoencoders/__init__.py +1 -0
  38. diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
  39. diffusers/models/autoencoders/autoencoder_dc.py +3 -3
  40. diffusers/models/autoencoders/autoencoder_kl.py +4 -4
  41. diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
  42. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
  43. diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1108 -0
  44. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
  45. diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
  46. diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
  47. diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
  48. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
  49. diffusers/models/autoencoders/autoencoder_kl_wan.py +256 -22
  50. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
  51. diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
  52. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  53. diffusers/models/autoencoders/vae.py +13 -2
  54. diffusers/models/autoencoders/vq_model.py +2 -2
  55. diffusers/models/cache_utils.py +1 -1
  56. diffusers/models/controlnet.py +1 -1
  57. diffusers/models/controlnet_flux.py +1 -1
  58. diffusers/models/controlnet_sd3.py +1 -1
  59. diffusers/models/controlnet_sparsectrl.py +1 -1
  60. diffusers/models/controlnets/__init__.py +1 -0
  61. diffusers/models/controlnets/controlnet.py +3 -3
  62. diffusers/models/controlnets/controlnet_flax.py +1 -1
  63. diffusers/models/controlnets/controlnet_flux.py +16 -15
  64. diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
  65. diffusers/models/controlnets/controlnet_sana.py +290 -0
  66. diffusers/models/controlnets/controlnet_sd3.py +1 -1
  67. diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
  68. diffusers/models/controlnets/controlnet_union.py +1 -1
  69. diffusers/models/controlnets/controlnet_xs.py +7 -7
  70. diffusers/models/controlnets/multicontrolnet.py +4 -5
  71. diffusers/models/controlnets/multicontrolnet_union.py +5 -6
  72. diffusers/models/downsampling.py +2 -2
  73. diffusers/models/embeddings.py +10 -12
  74. diffusers/models/embeddings_flax.py +2 -2
  75. diffusers/models/lora.py +3 -3
  76. diffusers/models/modeling_utils.py +44 -14
  77. diffusers/models/normalization.py +4 -4
  78. diffusers/models/resnet.py +2 -2
  79. diffusers/models/resnet_flax.py +1 -1
  80. diffusers/models/transformers/__init__.py +5 -0
  81. diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
  82. diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
  83. diffusers/models/transformers/consisid_transformer_3d.py +1 -1
  84. diffusers/models/transformers/dit_transformer_2d.py +2 -2
  85. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  86. diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
  87. diffusers/models/transformers/latte_transformer_3d.py +4 -5
  88. diffusers/models/transformers/lumina_nextdit2d.py +2 -2
  89. diffusers/models/transformers/pixart_transformer_2d.py +3 -3
  90. diffusers/models/transformers/prior_transformer.py +1 -1
  91. diffusers/models/transformers/sana_transformer.py +8 -3
  92. diffusers/models/transformers/stable_audio_transformer.py +5 -9
  93. diffusers/models/transformers/t5_film_transformer.py +3 -3
  94. diffusers/models/transformers/transformer_2d.py +1 -1
  95. diffusers/models/transformers/transformer_allegro.py +1 -1
  96. diffusers/models/transformers/transformer_chroma.py +742 -0
  97. diffusers/models/transformers/transformer_cogview3plus.py +5 -10
  98. diffusers/models/transformers/transformer_cogview4.py +317 -25
  99. diffusers/models/transformers/transformer_cosmos.py +579 -0
  100. diffusers/models/transformers/transformer_flux.py +9 -11
  101. diffusers/models/transformers/transformer_hidream_image.py +942 -0
  102. diffusers/models/transformers/transformer_hunyuan_video.py +6 -8
  103. diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
  104. diffusers/models/transformers/transformer_ltx.py +2 -2
  105. diffusers/models/transformers/transformer_lumina2.py +1 -1
  106. diffusers/models/transformers/transformer_mochi.py +1 -1
  107. diffusers/models/transformers/transformer_omnigen.py +2 -2
  108. diffusers/models/transformers/transformer_sd3.py +7 -7
  109. diffusers/models/transformers/transformer_temporal.py +1 -1
  110. diffusers/models/transformers/transformer_wan.py +24 -8
  111. diffusers/models/transformers/transformer_wan_vace.py +393 -0
  112. diffusers/models/unets/unet_1d.py +1 -1
  113. diffusers/models/unets/unet_1d_blocks.py +1 -1
  114. diffusers/models/unets/unet_2d.py +1 -1
  115. diffusers/models/unets/unet_2d_blocks.py +1 -1
  116. diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
  117. diffusers/models/unets/unet_2d_condition.py +2 -2
  118. diffusers/models/unets/unet_2d_condition_flax.py +2 -2
  119. diffusers/models/unets/unet_3d_blocks.py +1 -1
  120. diffusers/models/unets/unet_3d_condition.py +3 -3
  121. diffusers/models/unets/unet_i2vgen_xl.py +3 -3
  122. diffusers/models/unets/unet_kandinsky3.py +1 -1
  123. diffusers/models/unets/unet_motion_model.py +2 -2
  124. diffusers/models/unets/unet_stable_cascade.py +1 -1
  125. diffusers/models/upsampling.py +2 -2
  126. diffusers/models/vae_flax.py +2 -2
  127. diffusers/models/vq_model.py +1 -1
  128. diffusers/pipelines/__init__.py +37 -6
  129. diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
  130. diffusers/pipelines/amused/pipeline_amused.py +7 -6
  131. diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
  132. diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
  133. diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
  134. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
  135. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
  136. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
  137. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
  138. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
  139. diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
  140. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  141. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +23 -13
  142. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
  143. diffusers/pipelines/auto_pipeline.py +6 -7
  144. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  145. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  146. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
  147. diffusers/pipelines/chroma/__init__.py +49 -0
  148. diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
  149. diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
  150. diffusers/pipelines/chroma/pipeline_output.py +21 -0
  151. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +8 -8
  152. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +8 -8
  153. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +8 -8
  154. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +8 -8
  155. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
  156. diffusers/pipelines/cogview4/pipeline_cogview4.py +7 -7
  157. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
  158. diffusers/pipelines/consisid/consisid_utils.py +2 -2
  159. diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
  160. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  161. diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
  162. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +8 -8
  163. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
  164. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
  165. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
  166. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
  167. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
  168. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +14 -14
  169. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +5 -5
  170. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +13 -13
  171. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  172. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
  173. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
  174. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  175. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
  176. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
  177. diffusers/pipelines/cosmos/__init__.py +54 -0
  178. diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
  179. diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
  180. diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
  181. diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
  182. diffusers/pipelines/cosmos/pipeline_output.py +40 -0
  183. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
  184. diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
  185. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  186. diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
  187. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
  188. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
  189. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
  190. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
  191. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
  192. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
  193. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
  194. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  195. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
  196. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  197. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
  198. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
  199. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  200. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  201. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  202. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  203. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  204. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +7 -7
  205. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
  206. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
  207. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
  208. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
  209. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
  210. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  211. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
  212. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
  213. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
  214. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
  215. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
  216. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  217. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  218. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
  219. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
  220. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
  221. diffusers/pipelines/flux/modeling_flux.py +1 -1
  222. diffusers/pipelines/flux/pipeline_flux.py +10 -17
  223. diffusers/pipelines/flux/pipeline_flux_control.py +6 -6
  224. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -6
  225. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +6 -6
  226. diffusers/pipelines/flux/pipeline_flux_controlnet.py +6 -6
  227. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +30 -22
  228. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +2 -1
  229. diffusers/pipelines/flux/pipeline_flux_fill.py +6 -6
  230. diffusers/pipelines/flux/pipeline_flux_img2img.py +39 -6
  231. diffusers/pipelines/flux/pipeline_flux_inpaint.py +11 -6
  232. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +1 -1
  233. diffusers/pipelines/free_init_utils.py +2 -2
  234. diffusers/pipelines/free_noise_utils.py +3 -3
  235. diffusers/pipelines/hidream_image/__init__.py +47 -0
  236. diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
  237. diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
  238. diffusers/pipelines/hunyuan_video/__init__.py +2 -0
  239. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
  240. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +8 -8
  241. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
  242. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
  243. diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
  244. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
  245. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
  246. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
  247. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
  248. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
  249. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
  250. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
  251. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  252. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
  253. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
  254. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
  255. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
  256. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
  257. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
  258. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
  259. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
  260. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
  261. diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
  262. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
  263. diffusers/pipelines/kolors/text_encoder.py +3 -3
  264. diffusers/pipelines/kolors/tokenizer.py +1 -1
  265. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
  266. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
  267. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  268. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
  269. diffusers/pipelines/latte/pipeline_latte.py +12 -12
  270. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
  271. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
  272. diffusers/pipelines/ltx/__init__.py +4 -0
  273. diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
  274. diffusers/pipelines/ltx/pipeline_ltx.py +51 -6
  275. diffusers/pipelines/ltx/pipeline_ltx_condition.py +107 -29
  276. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +50 -6
  277. diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
  278. diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
  279. diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
  280. diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
  281. diffusers/pipelines/mochi/pipeline_mochi.py +6 -6
  282. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
  283. diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
  284. diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
  285. diffusers/pipelines/onnx_utils.py +15 -2
  286. diffusers/pipelines/pag/pag_utils.py +2 -2
  287. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
  288. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
  289. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
  290. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
  291. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
  292. diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
  293. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
  294. diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
  295. diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
  296. diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
  297. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
  298. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
  299. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
  300. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
  301. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
  302. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
  303. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
  304. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  305. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
  306. diffusers/pipelines/pia/pipeline_pia.py +8 -6
  307. diffusers/pipelines/pipeline_flax_utils.py +3 -4
  308. diffusers/pipelines/pipeline_loading_utils.py +89 -13
  309. diffusers/pipelines/pipeline_utils.py +105 -33
  310. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +11 -11
  311. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +11 -11
  312. diffusers/pipelines/sana/__init__.py +4 -0
  313. diffusers/pipelines/sana/pipeline_sana.py +23 -21
  314. diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
  315. diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
  316. diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
  317. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
  318. diffusers/pipelines/shap_e/camera.py +1 -1
  319. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  320. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  321. diffusers/pipelines/shap_e/renderer.py +3 -3
  322. diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
  323. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
  324. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
  325. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
  326. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
  327. diffusers/pipelines/stable_diffusion/__init__.py +0 -7
  328. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  329. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
  330. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  331. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
  332. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  333. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +10 -10
  334. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
  335. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +10 -10
  336. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +9 -9
  337. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +8 -8
  338. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
  339. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
  340. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
  341. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
  342. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
  343. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
  344. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
  345. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
  346. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
  347. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  348. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  349. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  350. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -7
  351. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
  352. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
  353. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
  354. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
  355. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
  356. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
  357. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
  358. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
  359. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
  360. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
  361. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
  362. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  363. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
  364. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  365. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
  366. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
  367. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
  368. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
  369. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
  370. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
  371. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
  372. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
  373. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
  374. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
  375. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
  376. diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
  377. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
  378. diffusers/pipelines/unclip/text_proj.py +2 -2
  379. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
  380. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  381. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
  382. diffusers/pipelines/visualcloze/__init__.py +52 -0
  383. diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
  384. diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
  385. diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
  386. diffusers/pipelines/wan/__init__.py +2 -0
  387. diffusers/pipelines/wan/pipeline_wan.py +13 -10
  388. diffusers/pipelines/wan/pipeline_wan_i2v.py +38 -18
  389. diffusers/pipelines/wan/pipeline_wan_vace.py +976 -0
  390. diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
  391. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  392. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
  393. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  394. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  395. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
  396. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
  397. diffusers/quantizers/__init__.py +179 -1
  398. diffusers/quantizers/base.py +6 -1
  399. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
  400. diffusers/quantizers/bitsandbytes/utils.py +10 -7
  401. diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
  402. diffusers/quantizers/gguf/utils.py +16 -13
  403. diffusers/quantizers/quantization_config.py +18 -16
  404. diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
  405. diffusers/quantizers/torchao/torchao_quantizer.py +5 -1
  406. diffusers/schedulers/__init__.py +3 -1
  407. diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
  408. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  409. diffusers/schedulers/scheduling_consistency_models.py +1 -1
  410. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
  411. diffusers/schedulers/scheduling_ddim.py +8 -8
  412. diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
  413. diffusers/schedulers/scheduling_ddim_flax.py +6 -6
  414. diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
  415. diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
  416. diffusers/schedulers/scheduling_ddpm.py +9 -9
  417. diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
  418. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
  419. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
  420. diffusers/schedulers/scheduling_deis_multistep.py +8 -8
  421. diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
  422. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -12
  423. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
  424. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
  425. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  426. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -13
  427. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
  428. diffusers/schedulers/scheduling_edm_euler.py +20 -11
  429. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
  430. diffusers/schedulers/scheduling_euler_discrete.py +3 -3
  431. diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
  432. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
  433. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
  434. diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
  435. diffusers/schedulers/scheduling_heun_discrete.py +2 -2
  436. diffusers/schedulers/scheduling_ipndm.py +2 -2
  437. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
  438. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
  439. diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
  440. diffusers/schedulers/scheduling_lcm.py +3 -3
  441. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  442. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  443. diffusers/schedulers/scheduling_pndm.py +4 -4
  444. diffusers/schedulers/scheduling_pndm_flax.py +4 -4
  445. diffusers/schedulers/scheduling_repaint.py +9 -9
  446. diffusers/schedulers/scheduling_sasolver.py +15 -15
  447. diffusers/schedulers/scheduling_scm.py +1 -1
  448. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  449. diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
  450. diffusers/schedulers/scheduling_tcd.py +3 -3
  451. diffusers/schedulers/scheduling_unclip.py +5 -5
  452. diffusers/schedulers/scheduling_unipc_multistep.py +11 -11
  453. diffusers/schedulers/scheduling_utils.py +1 -1
  454. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  455. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  456. diffusers/training_utils.py +13 -5
  457. diffusers/utils/__init__.py +5 -0
  458. diffusers/utils/accelerate_utils.py +1 -1
  459. diffusers/utils/doc_utils.py +1 -1
  460. diffusers/utils/dummy_pt_objects.py +120 -0
  461. diffusers/utils/dummy_torch_and_transformers_objects.py +225 -0
  462. diffusers/utils/dynamic_modules_utils.py +21 -3
  463. diffusers/utils/export_utils.py +1 -1
  464. diffusers/utils/import_utils.py +81 -18
  465. diffusers/utils/logging.py +1 -1
  466. diffusers/utils/outputs.py +2 -1
  467. diffusers/utils/peft_utils.py +91 -8
  468. diffusers/utils/state_dict_utils.py +20 -3
  469. diffusers/utils/testing_utils.py +59 -7
  470. diffusers/utils/torch_utils.py +25 -5
  471. diffusers/video_processor.py +2 -2
  472. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/METADATA +70 -55
  473. diffusers-0.34.0.dist-info/RECORD +639 -0
  474. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/WHEEL +1 -1
  475. diffusers-0.33.1.dist-info/RECORD +0 -608
  476. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/LICENSE +0 -0
  477. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/entry_points.txt +0 -0
  478. {diffusers-0.33.1.dist-info → diffusers-0.34.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- # Copyright 2024 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -100,6 +100,50 @@ DEFAULT_PROMPT_TEMPLATE = {
100
100
  }
101
101
 
102
102
 
103
+ def _expand_input_ids_with_image_tokens(
104
+ text_input_ids,
105
+ prompt_attention_mask,
106
+ max_sequence_length,
107
+ image_token_index,
108
+ image_emb_len,
109
+ image_emb_start,
110
+ image_emb_end,
111
+ pad_token_id,
112
+ ):
113
+ special_image_token_mask = text_input_ids == image_token_index
114
+ num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
115
+ batch_indices, non_image_indices = torch.where(text_input_ids != image_token_index)
116
+
117
+ max_expanded_length = max_sequence_length + (num_special_image_tokens.max() * (image_emb_len - 1))
118
+ new_token_positions = torch.cumsum((special_image_token_mask * (image_emb_len - 1) + 1), -1) - 1
119
+ text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
120
+
121
+ expanded_input_ids = torch.full(
122
+ (text_input_ids.shape[0], max_expanded_length),
123
+ pad_token_id,
124
+ dtype=text_input_ids.dtype,
125
+ device=text_input_ids.device,
126
+ )
127
+ expanded_input_ids[batch_indices, text_to_overwrite] = text_input_ids[batch_indices, non_image_indices]
128
+ expanded_input_ids[batch_indices, image_emb_start:image_emb_end] = image_token_index
129
+
130
+ expanded_attention_mask = torch.zeros(
131
+ (text_input_ids.shape[0], max_expanded_length),
132
+ dtype=prompt_attention_mask.dtype,
133
+ device=prompt_attention_mask.device,
134
+ )
135
+ attn_batch_indices, attention_indices = torch.where(expanded_input_ids != pad_token_id)
136
+ expanded_attention_mask[attn_batch_indices, attention_indices] = 1.0
137
+ expanded_attention_mask = expanded_attention_mask.to(prompt_attention_mask.dtype)
138
+ position_ids = (expanded_attention_mask.cumsum(-1) - 1).masked_fill_((expanded_attention_mask == 0), 1)
139
+
140
+ return {
141
+ "input_ids": expanded_input_ids,
142
+ "attention_mask": expanded_attention_mask,
143
+ "position_ids": position_ids,
144
+ }
145
+
146
+
103
147
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
104
148
  def retrieve_timesteps(
105
149
  scheduler,
@@ -251,6 +295,12 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
251
295
  prompt = [prompt_template["template"].format(p) for p in prompt]
252
296
 
253
297
  crop_start = prompt_template.get("crop_start", None)
298
+
299
+ image_emb_len = prompt_template.get("image_emb_len", 576)
300
+ image_emb_start = prompt_template.get("image_emb_start", 5)
301
+ image_emb_end = prompt_template.get("image_emb_end", 581)
302
+ double_return_token_id = prompt_template.get("double_return_token_id", 271)
303
+
254
304
  if crop_start is None:
255
305
  prompt_template_input = self.tokenizer(
256
306
  prompt_template["template"],
@@ -280,19 +330,25 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
280
330
 
281
331
  image_embeds = self.image_processor(image, return_tensors="pt").pixel_values.to(device)
282
332
 
333
+ image_token_index = self.text_encoder.config.image_token_index
334
+ pad_token_id = self.text_encoder.config.pad_token_id
335
+ expanded_inputs = _expand_input_ids_with_image_tokens(
336
+ text_input_ids,
337
+ prompt_attention_mask,
338
+ max_sequence_length,
339
+ image_token_index,
340
+ image_emb_len,
341
+ image_emb_start,
342
+ image_emb_end,
343
+ pad_token_id,
344
+ )
283
345
  prompt_embeds = self.text_encoder(
284
- input_ids=text_input_ids,
285
- attention_mask=prompt_attention_mask,
346
+ **expanded_inputs,
286
347
  pixel_values=image_embeds,
287
348
  output_hidden_states=True,
288
349
  ).hidden_states[-(num_hidden_layers_to_skip + 1)]
289
350
  prompt_embeds = prompt_embeds.to(dtype=dtype)
290
351
 
291
- image_emb_len = prompt_template.get("image_emb_len", 576)
292
- image_emb_start = prompt_template.get("image_emb_start", 5)
293
- image_emb_end = prompt_template.get("image_emb_end", 581)
294
- double_return_token_id = prompt_template.get("double_return_token_id", 271)
295
-
296
352
  if crop_start is not None and crop_start > 0:
297
353
  text_crop_start = crop_start - 1 + image_emb_len
298
354
  batch_indices, last_double_return_token_indices = torch.where(text_input_ids == double_return_token_id)
@@ -655,13 +711,13 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
655
711
  true_cfg_scale (`float`, *optional*, defaults to 1.0):
656
712
  When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
657
713
  guidance_scale (`float`, defaults to `1.0`):
658
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
659
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
660
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
661
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
662
- usually at the expense of lower image quality. Note that the only available HunyuanVideo model is
663
- CFG-distilled, which means that traditional guidance between unconditional and conditional latent is
664
- not applied.
714
+ Guidance scale as defined in [Classifier-Free Diffusion
715
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
716
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
717
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
718
+ the text `prompt`, usually at the expense of lower image quality. Note that the only available
719
+ HunyuanVideo model is CFG-distilled, which means that traditional guidance between unconditional and
720
+ conditional latent is not applied.
665
721
  num_videos_per_prompt (`int`, *optional*, defaults to 1):
666
722
  The number of images to generate per prompt.
667
723
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -1,5 +1,8 @@
1
1
  from dataclasses import dataclass
2
+ from typing import List, Union
2
3
 
4
+ import numpy as np
5
+ import PIL.Image
3
6
  import torch
4
7
 
5
8
  from diffusers.utils import BaseOutput
@@ -18,3 +21,19 @@ class HunyuanVideoPipelineOutput(BaseOutput):
18
21
  """
19
22
 
20
23
  frames: torch.Tensor
24
+
25
+
26
+ @dataclass
27
+ class HunyuanVideoFramepackPipelineOutput(BaseOutput):
28
+ r"""
29
+ Output class for HunyuanVideo pipelines.
30
+
31
+ Args:
32
+ frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
33
+ List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
34
+ denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
35
+ `(batch_size, num_frames, channels, height, width)`. Or, a list of torch tensors where each tensor
36
+ corresponds to a latent that decodes to multiple frames.
37
+ """
38
+
39
+ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]], List[torch.Tensor]]
@@ -1,4 +1,4 @@
1
- # Copyright 2024 HunyuanDiT Authors and The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 HunyuanDiT Authors and The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -128,7 +128,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
128
128
  r"""
129
129
  Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
130
130
  Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
131
- Flawed](https://arxiv.org/pdf/2305.08891.pdf).
131
+ Flawed](https://huggingface.co/papers/2305.08891).
132
132
 
133
133
  Args:
134
134
  noise_cfg (`torch.Tensor`):
@@ -433,7 +433,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
433
433
  def prepare_extra_step_kwargs(self, generator, eta):
434
434
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
435
435
  # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
436
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
436
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
437
437
  # and should be between [0, 1]
438
438
 
439
439
  accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -555,7 +555,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
555
555
  return self._guidance_rescale
556
556
 
557
557
  # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
558
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
558
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
559
559
  # corresponds to doing no classifier free guidance.
560
560
  @property
561
561
  def do_classifier_free_guidance(self):
@@ -625,8 +625,8 @@ class HunyuanDiTPipeline(DiffusionPipeline):
625
625
  num_images_per_prompt (`int`, *optional*, defaults to 1):
626
626
  The number of images to generate per prompt.
627
627
  eta (`float`, *optional*, defaults to 0.0):
628
- Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
629
- to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
628
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
629
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
630
630
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
631
631
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
632
632
  generation deterministic.
@@ -662,7 +662,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
662
662
  inputs will be passed.
663
663
  guidance_rescale (`float`, *optional*, defaults to 0.0):
664
664
  Rescale the noise_cfg according to `guidance_rescale`. Based on findings of [Common Diffusion Noise
665
- Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
665
+ Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891). See Section 3.4
666
666
  original_size (`Tuple[int, int]`, *optional*, defaults to `(1024, 1024)`):
667
667
  The original size of the image. Used to calculate the time ids.
668
668
  target_size (`Tuple[int, int]`, *optional*):
@@ -865,7 +865,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
865
865
  noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
866
866
 
867
867
  if self.do_classifier_free_guidance and guidance_rescale > 0.0:
868
- # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
868
+ # Based on 3.4. in https://huggingface.co/papers/2305.08891
869
869
  noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
870
870
 
871
871
  # compute the previous noisy sample x_t -> x_t-1
@@ -1,4 +1,4 @@
1
- # Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -33,7 +33,7 @@ from ...utils import (
33
33
  )
34
34
  from ...utils.torch_utils import randn_tensor
35
35
  from ...video_processor import VideoProcessor
36
- from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
36
+ from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin
37
37
 
38
38
 
39
39
  if is_torch_xla_available():
@@ -97,9 +97,11 @@ class I2VGenXLPipelineOutput(BaseOutput):
97
97
 
98
98
 
99
99
  class I2VGenXLPipeline(
100
+ DeprecatedPipelineMixin,
100
101
  DiffusionPipeline,
101
102
  StableDiffusionMixin,
102
103
  ):
104
+ _last_supported_version = "0.33.1"
103
105
  r"""
104
106
  Pipeline for image-to-video generation as proposed in [I2VGenXL](https://i2vgen-xl.github.io/).
105
107
 
@@ -151,7 +153,7 @@ class I2VGenXLPipeline(
151
153
  return self._guidance_scale
152
154
 
153
155
  # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
154
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
156
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
155
157
  # corresponds to doing no classifier free guidance.
156
158
  @property
157
159
  def do_classifier_free_guidance(self):
@@ -384,7 +386,7 @@ class I2VGenXLPipeline(
384
386
  def prepare_extra_step_kwargs(self, generator, eta):
385
387
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
386
388
  # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
387
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
389
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
388
390
  # and should be between [0, 1]
389
391
 
390
392
  accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -462,7 +464,7 @@ class I2VGenXLPipeline(
462
464
  image_latents = image_latents.unsqueeze(2)
463
465
 
464
466
  # Append a position mask for each subsequent frame
465
- # after the intial image latent frame
467
+ # after the initial image latent frame
466
468
  frame_position_mask = []
467
469
  for frame_idx in range(num_frames - 1):
468
470
  scale = (frame_idx + 1) / (num_frames - 1)
@@ -557,8 +559,8 @@ class I2VGenXLPipeline(
557
559
  The prompt or prompts to guide what to not include in image generation. If not defined, you need to
558
560
  pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
559
561
  eta (`float`, *optional*):
560
- Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
561
- to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
562
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
563
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
562
564
  num_videos_per_prompt (`int`, *optional*):
563
565
  The number of images to generate per prompt.
564
566
  decode_chunk_size (`int`, *optional*):
@@ -614,7 +616,7 @@ class I2VGenXLPipeline(
614
616
 
615
617
  device = self._execution_device
616
618
  # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
617
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
619
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
618
620
  # corresponds to doing no classifier free guidance.
619
621
  self._guidance_scale = guidance_scale
620
622
 
@@ -1,4 +1,4 @@
1
- # Copyright 2024 The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -278,11 +278,11 @@ class KandinskyPipeline(DiffusionPipeline):
278
278
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
279
279
  expense of slower inference.
280
280
  guidance_scale (`float`, *optional*, defaults to 4.0):
281
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
282
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
283
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
284
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
285
- usually at the expense of lower image quality.
281
+ Guidance scale as defined in [Classifier-Free Diffusion
282
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
283
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
284
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
285
+ the text `prompt`, usually at the expense of lower image quality.
286
286
  num_images_per_prompt (`int`, *optional*, defaults to 1):
287
287
  The number of images to generate per prompt.
288
288
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -1,4 +1,4 @@
1
- # Copyright 2024 The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -193,7 +193,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
193
193
  def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
194
194
  self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
195
195
 
196
- def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
196
+ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
197
197
  r"""
198
198
  Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
199
199
  Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
@@ -251,20 +251,20 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
251
251
  width (`int`, *optional*, defaults to 512):
252
252
  The width in pixels of the generated image.
253
253
  prior_guidance_scale (`float`, *optional*, defaults to 4.0):
254
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
255
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
256
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
257
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
258
- usually at the expense of lower image quality.
254
+ Guidance scale as defined in [Classifier-Free Diffusion
255
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
256
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
257
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
258
+ the text `prompt`, usually at the expense of lower image quality.
259
259
  prior_num_inference_steps (`int`, *optional*, defaults to 100):
260
260
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
261
261
  expense of slower inference.
262
262
  guidance_scale (`float`, *optional*, defaults to 4.0):
263
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
264
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
265
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
266
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
267
- usually at the expense of lower image quality.
263
+ Guidance scale as defined in [Classifier-Free Diffusion
264
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
265
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
266
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
267
+ the text `prompt`, usually at the expense of lower image quality.
268
268
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
269
269
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
270
270
  to make generation deterministic.
@@ -411,7 +411,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
411
411
  def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
412
412
  self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
413
413
 
414
- def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
414
+ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
415
415
  r"""
416
416
  Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
417
417
  text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -482,20 +482,20 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
482
482
  be maximum and the denoising process will run for the full number of iterations specified in
483
483
  `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
484
484
  prior_guidance_scale (`float`, *optional*, defaults to 4.0):
485
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
486
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
487
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
488
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
489
- usually at the expense of lower image quality.
485
+ Guidance scale as defined in [Classifier-Free Diffusion
486
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
487
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
488
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
489
+ the text `prompt`, usually at the expense of lower image quality.
490
490
  prior_num_inference_steps (`int`, *optional*, defaults to 100):
491
491
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
492
492
  expense of slower inference.
493
493
  guidance_scale (`float`, *optional*, defaults to 4.0):
494
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
495
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
496
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
497
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
498
- usually at the expense of lower image quality.
494
+ Guidance scale as defined in [Classifier-Free Diffusion
495
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
496
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
497
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
498
+ the text `prompt`, usually at the expense of lower image quality.
499
499
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
500
500
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
501
501
  to make generation deterministic.
@@ -652,7 +652,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
652
652
  def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
653
653
  self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
654
654
 
655
- def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
655
+ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
656
656
  r"""
657
657
  Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
658
658
  text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -722,20 +722,20 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
722
722
  width (`int`, *optional*, defaults to 512):
723
723
  The width in pixels of the generated image.
724
724
  prior_guidance_scale (`float`, *optional*, defaults to 4.0):
725
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
726
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
727
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
728
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
729
- usually at the expense of lower image quality.
725
+ Guidance scale as defined in [Classifier-Free Diffusion
726
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
727
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
728
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
729
+ the text `prompt`, usually at the expense of lower image quality.
730
730
  prior_num_inference_steps (`int`, *optional*, defaults to 100):
731
731
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
732
732
  expense of slower inference.
733
733
  guidance_scale (`float`, *optional*, defaults to 4.0):
734
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
735
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
736
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
737
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
738
- usually at the expense of lower image quality.
734
+ Guidance scale as defined in [Classifier-Free Diffusion
735
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
736
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
737
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
738
+ the text `prompt`, usually at the expense of lower image quality.
739
739
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
740
740
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
741
741
  to make generation deterministic.
@@ -1,4 +1,4 @@
1
- # Copyright 2024 The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -14,14 +14,13 @@
14
14
 
15
15
  from typing import Callable, List, Optional, Union
16
16
 
17
- import numpy as np
18
17
  import PIL.Image
19
18
  import torch
20
- from PIL import Image
21
19
  from transformers import (
22
20
  XLMRobertaTokenizer,
23
21
  )
24
22
 
23
+ from ...image_processor import VaeImageProcessor
25
24
  from ...models import UNet2DConditionModel, VQModel
26
25
  from ...schedulers import DDIMScheduler
27
26
  from ...utils import (
@@ -95,15 +94,6 @@ def get_new_h_w(h, w, scale_factor=8):
95
94
  return new_h * scale_factor, new_w * scale_factor
96
95
 
97
96
 
98
- def prepare_image(pil_image, w=512, h=512):
99
- pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
100
- arr = np.array(pil_image.convert("RGB"))
101
- arr = arr.astype(np.float32) / 127.5 - 1
102
- arr = np.transpose(arr, [2, 0, 1])
103
- image = torch.from_numpy(arr).unsqueeze(0)
104
- return image
105
-
106
-
107
97
  class KandinskyImg2ImgPipeline(DiffusionPipeline):
108
98
  """
109
99
  Pipeline for image-to-image generation using Kandinsky
@@ -143,7 +133,16 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
143
133
  scheduler=scheduler,
144
134
  movq=movq,
145
135
  )
146
- self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
136
+ self.movq_scale_factor = (
137
+ 2 ** (len(self.movq.config.block_out_channels) - 1) if getattr(self, "movq", None) else 8
138
+ )
139
+ movq_latent_channels = self.movq.config.latent_channels if getattr(self, "movq", None) else 4
140
+ self.image_processor = VaeImageProcessor(
141
+ vae_scale_factor=self.movq_scale_factor,
142
+ vae_latent_channels=movq_latent_channels,
143
+ resample="bicubic",
144
+ reducing_gap=1,
145
+ )
147
146
 
148
147
  def get_timesteps(self, num_inference_steps, strength, device):
149
148
  # get the original timestep using init_timestep
@@ -350,11 +349,11 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
350
349
  be maximum and the denoising process will run for the full number of iterations specified in
351
350
  `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
352
351
  guidance_scale (`float`, *optional*, defaults to 4.0):
353
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
354
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
355
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
356
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
357
- usually at the expense of lower image quality.
352
+ Guidance scale as defined in [Classifier-Free Diffusion
353
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
354
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
355
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
356
+ the text `prompt`, usually at the expense of lower image quality.
358
357
  num_images_per_prompt (`int`, *optional*, defaults to 1):
359
358
  The number of images to generate per prompt.
360
359
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -417,7 +416,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
417
416
  f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor"
418
417
  )
419
418
 
420
- image = torch.cat([prepare_image(i, width, height) for i in image], dim=0)
419
+ image = torch.cat([self.image_processor.preprocess(i, width, height) for i in image], dim=0)
421
420
  image = image.to(dtype=prompt_embeds.dtype, device=device)
422
421
 
423
422
  latents = self.movq.encode(image)["latents"]
@@ -498,13 +497,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
498
497
  if output_type not in ["pt", "np", "pil"]:
499
498
  raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
500
499
 
501
- if output_type in ["np", "pil"]:
502
- image = image * 0.5 + 0.5
503
- image = image.clamp(0, 1)
504
- image = image.cpu().permute(0, 2, 3, 1).float().numpy()
505
-
506
- if output_type == "pil":
507
- image = self.numpy_to_pil(image)
500
+ image = self.image_processor.postprocess(image, output_type)
508
501
 
509
502
  if not return_dict:
510
503
  return (image,)
@@ -1,4 +1,4 @@
1
- # Copyright 2024 The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -456,11 +456,11 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
456
456
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
457
457
  expense of slower inference.
458
458
  guidance_scale (`float`, *optional*, defaults to 4.0):
459
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
460
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
461
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
462
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
463
- usually at the expense of lower image quality.
459
+ Guidance scale as defined in [Classifier-Free Diffusion
460
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
461
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
462
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
463
+ the text `prompt`, usually at the expense of lower image quality.
464
464
  num_images_per_prompt (`int`, *optional*, defaults to 1):
465
465
  The number of images to generate per prompt.
466
466
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -496,7 +496,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
496
496
  "As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. "
497
497
  "This way, Kandinsky's masking behavior is aligned with Stable Diffusion. "
498
498
  "THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. "
499
- "This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0"
499
+ "This warning will be suppressed after the first inference call and will be removed in diffusers>0.23.0"
500
500
  )
501
501
  self._warn_has_been_called = True
502
502
 
@@ -1,4 +1,4 @@
1
- # Copyright 2024 The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -220,11 +220,11 @@ class KandinskyPriorPipeline(DiffusionPipeline):
220
220
  The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
221
221
  `guidance_scale` is less than `1`).
222
222
  guidance_scale (`float`, *optional*, defaults to 4.0):
223
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
224
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
225
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
226
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
227
- usually at the expense of lower image quality.
223
+ Guidance scale as defined in [Classifier-Free Diffusion
224
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
225
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
226
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
227
+ the text `prompt`, usually at the expense of lower image quality.
228
228
 
229
229
  Examples:
230
230
 
@@ -439,11 +439,11 @@ class KandinskyPriorPipeline(DiffusionPipeline):
439
439
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
440
440
  tensor will be generated by sampling using the supplied random `generator`.
441
441
  guidance_scale (`float`, *optional*, defaults to 4.0):
442
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
443
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
444
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
445
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
446
- usually at the expense of lower image quality.
442
+ Guidance scale as defined in [Classifier-Free Diffusion
443
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
444
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
445
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
446
+ the text `prompt`, usually at the expense of lower image quality.
447
447
  output_type (`str`, *optional*, defaults to `"pt"`):
448
448
  The output format of the generated image. Choose between: `"np"` (`np.array`) or `"pt"`
449
449
  (`torch.Tensor`).
@@ -1,4 +1,4 @@
1
- # Copyright 2024 The HuggingFace Team. All rights reserved.
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -162,11 +162,11 @@ class KandinskyV22Pipeline(DiffusionPipeline):
162
162
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
163
163
  expense of slower inference.
164
164
  guidance_scale (`float`, *optional*, defaults to 4.0):
165
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
166
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
167
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
168
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
169
- usually at the expense of lower image quality.
165
+ Guidance scale as defined in [Classifier-Free Diffusion
166
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
167
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
168
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
169
+ the text `prompt`, usually at the expense of lower image quality.
170
170
  num_images_per_prompt (`int`, *optional*, defaults to 1):
171
171
  The number of images to generate per prompt.
172
172
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):