diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (389)
  1. diffusers/__init__.py +186 -3
  2. diffusers/configuration_utils.py +40 -12
  3. diffusers/dependency_versions_table.py +9 -2
  4. diffusers/hooks/__init__.py +9 -0
  5. diffusers/hooks/faster_cache.py +653 -0
  6. diffusers/hooks/group_offloading.py +793 -0
  7. diffusers/hooks/hooks.py +236 -0
  8. diffusers/hooks/layerwise_casting.py +245 -0
  9. diffusers/hooks/pyramid_attention_broadcast.py +311 -0
  10. diffusers/loaders/__init__.py +6 -0
  11. diffusers/loaders/ip_adapter.py +38 -30
  12. diffusers/loaders/lora_base.py +198 -28
  13. diffusers/loaders/lora_conversion_utils.py +679 -44
  14. diffusers/loaders/lora_pipeline.py +1963 -801
  15. diffusers/loaders/peft.py +169 -84
  16. diffusers/loaders/single_file.py +17 -2
  17. diffusers/loaders/single_file_model.py +53 -5
  18. diffusers/loaders/single_file_utils.py +653 -75
  19. diffusers/loaders/textual_inversion.py +9 -9
  20. diffusers/loaders/transformer_flux.py +8 -9
  21. diffusers/loaders/transformer_sd3.py +120 -39
  22. diffusers/loaders/unet.py +22 -32
  23. diffusers/models/__init__.py +22 -0
  24. diffusers/models/activations.py +9 -9
  25. diffusers/models/attention.py +0 -1
  26. diffusers/models/attention_processor.py +163 -25
  27. diffusers/models/auto_model.py +169 -0
  28. diffusers/models/autoencoders/__init__.py +2 -0
  29. diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
  30. diffusers/models/autoencoders/autoencoder_dc.py +106 -4
  31. diffusers/models/autoencoders/autoencoder_kl.py +0 -4
  32. diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
  33. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
  34. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
  35. diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
  36. diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
  37. diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
  38. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
  39. diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
  40. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
  41. diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
  42. diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
  43. diffusers/models/autoencoders/vae.py +31 -141
  44. diffusers/models/autoencoders/vq_model.py +3 -0
  45. diffusers/models/cache_utils.py +108 -0
  46. diffusers/models/controlnets/__init__.py +1 -0
  47. diffusers/models/controlnets/controlnet.py +3 -8
  48. diffusers/models/controlnets/controlnet_flux.py +14 -42
  49. diffusers/models/controlnets/controlnet_sd3.py +58 -34
  50. diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
  51. diffusers/models/controlnets/controlnet_union.py +27 -18
  52. diffusers/models/controlnets/controlnet_xs.py +7 -46
  53. diffusers/models/controlnets/multicontrolnet_union.py +196 -0
  54. diffusers/models/embeddings.py +18 -7
  55. diffusers/models/model_loading_utils.py +122 -80
  56. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  57. diffusers/models/modeling_flax_utils.py +1 -1
  58. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  59. diffusers/models/modeling_utils.py +617 -272
  60. diffusers/models/normalization.py +67 -14
  61. diffusers/models/resnet.py +1 -1
  62. diffusers/models/transformers/__init__.py +6 -0
  63. diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
  64. diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
  65. diffusers/models/transformers/consisid_transformer_3d.py +789 -0
  66. diffusers/models/transformers/dit_transformer_2d.py +5 -19
  67. diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
  68. diffusers/models/transformers/latte_transformer_3d.py +20 -15
  69. diffusers/models/transformers/lumina_nextdit2d.py +3 -1
  70. diffusers/models/transformers/pixart_transformer_2d.py +4 -19
  71. diffusers/models/transformers/prior_transformer.py +5 -1
  72. diffusers/models/transformers/sana_transformer.py +144 -40
  73. diffusers/models/transformers/stable_audio_transformer.py +5 -20
  74. diffusers/models/transformers/transformer_2d.py +7 -22
  75. diffusers/models/transformers/transformer_allegro.py +9 -17
  76. diffusers/models/transformers/transformer_cogview3plus.py +6 -17
  77. diffusers/models/transformers/transformer_cogview4.py +462 -0
  78. diffusers/models/transformers/transformer_easyanimate.py +527 -0
  79. diffusers/models/transformers/transformer_flux.py +68 -110
  80. diffusers/models/transformers/transformer_hunyuan_video.py +409 -49
  81. diffusers/models/transformers/transformer_ltx.py +53 -35
  82. diffusers/models/transformers/transformer_lumina2.py +548 -0
  83. diffusers/models/transformers/transformer_mochi.py +6 -17
  84. diffusers/models/transformers/transformer_omnigen.py +469 -0
  85. diffusers/models/transformers/transformer_sd3.py +56 -86
  86. diffusers/models/transformers/transformer_temporal.py +5 -11
  87. diffusers/models/transformers/transformer_wan.py +469 -0
  88. diffusers/models/unets/unet_1d.py +3 -1
  89. diffusers/models/unets/unet_2d.py +21 -20
  90. diffusers/models/unets/unet_2d_blocks.py +19 -243
  91. diffusers/models/unets/unet_2d_condition.py +4 -6
  92. diffusers/models/unets/unet_3d_blocks.py +14 -127
  93. diffusers/models/unets/unet_3d_condition.py +8 -12
  94. diffusers/models/unets/unet_i2vgen_xl.py +5 -13
  95. diffusers/models/unets/unet_kandinsky3.py +0 -4
  96. diffusers/models/unets/unet_motion_model.py +20 -114
  97. diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
  98. diffusers/models/unets/unet_stable_cascade.py +8 -35
  99. diffusers/models/unets/uvit_2d.py +1 -4
  100. diffusers/optimization.py +2 -2
  101. diffusers/pipelines/__init__.py +57 -8
  102. diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
  103. diffusers/pipelines/amused/pipeline_amused.py +15 -2
  104. diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
  105. diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
  106. diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
  107. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
  108. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
  109. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
  110. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
  111. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
  112. diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
  113. diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
  114. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
  115. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
  116. diffusers/pipelines/auto_pipeline.py +35 -14
  117. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  118. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
  119. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
  120. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
  121. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
  122. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
  123. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
  124. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
  125. diffusers/pipelines/cogview4/__init__.py +49 -0
  126. diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
  127. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
  128. diffusers/pipelines/cogview4/pipeline_output.py +21 -0
  129. diffusers/pipelines/consisid/__init__.py +49 -0
  130. diffusers/pipelines/consisid/consisid_utils.py +357 -0
  131. diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
  132. diffusers/pipelines/consisid/pipeline_output.py +20 -0
  133. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
  134. diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
  135. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
  136. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
  137. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
  138. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
  139. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
  140. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
  141. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
  142. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
  143. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
  144. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
  145. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
  146. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
  147. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
  148. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
  149. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
  150. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
  151. diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
  152. diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
  153. diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
  154. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
  155. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
  156. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
  157. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
  158. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
  159. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
  160. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
  161. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  162. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
  163. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
  164. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
  165. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
  166. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
  167. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
  168. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
  169. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
  170. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  171. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  172. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
  173. diffusers/pipelines/dit/pipeline_dit.py +15 -2
  174. diffusers/pipelines/easyanimate/__init__.py +52 -0
  175. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
  176. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
  177. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
  178. diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
  179. diffusers/pipelines/flux/pipeline_flux.py +53 -21
  180. diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
  181. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
  182. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
  183. diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
  184. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
  185. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
  186. diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
  187. diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
  188. diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
  189. diffusers/pipelines/free_noise_utils.py +3 -3
  190. diffusers/pipelines/hunyuan_video/__init__.py +4 -0
  191. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
  192. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
  193. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
  194. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
  195. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
  196. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
  197. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  198. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
  199. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
  200. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
  201. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
  202. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
  203. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
  204. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
  205. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
  206. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
  207. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
  208. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
  209. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
  210. diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
  211. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
  212. diffusers/pipelines/kolors/text_encoder.py +7 -34
  213. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
  214. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
  215. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
  216. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
  217. diffusers/pipelines/latte/pipeline_latte.py +36 -7
  218. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
  219. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
  220. diffusers/pipelines/ltx/__init__.py +2 -0
  221. diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
  222. diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
  223. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
  224. diffusers/pipelines/lumina/__init__.py +2 -2
  225. diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
  226. diffusers/pipelines/lumina2/__init__.py +48 -0
  227. diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
  228. diffusers/pipelines/marigold/__init__.py +2 -0
  229. diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
  230. diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
  231. diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
  232. diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
  233. diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
  234. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
  235. diffusers/pipelines/omnigen/__init__.py +50 -0
  236. diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
  237. diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
  238. diffusers/pipelines/onnx_utils.py +5 -3
  239. diffusers/pipelines/pag/pag_utils.py +1 -1
  240. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
  241. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
  242. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
  243. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
  244. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
  245. diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
  246. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
  247. diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
  248. diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
  249. diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
  250. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
  251. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
  252. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
  253. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
  254. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
  255. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
  256. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
  257. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
  258. diffusers/pipelines/pia/pipeline_pia.py +13 -1
  259. diffusers/pipelines/pipeline_flax_utils.py +7 -7
  260. diffusers/pipelines/pipeline_loading_utils.py +193 -83
  261. diffusers/pipelines/pipeline_utils.py +221 -106
  262. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
  263. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
  264. diffusers/pipelines/sana/__init__.py +2 -0
  265. diffusers/pipelines/sana/pipeline_sana.py +183 -58
  266. diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
  267. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
  268. diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
  269. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
  270. diffusers/pipelines/shap_e/renderer.py +6 -6
  271. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
  272. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
  273. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
  274. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
  275. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
  276. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
  277. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
  278. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
  279. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  280. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
  281. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
  282. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
  283. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
  284. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
  285. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
  286. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
  287. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
  288. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
  289. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
  290. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
  291. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
  292. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
  293. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
  294. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
  295. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
  296. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
  297. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
  298. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
  299. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
  300. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
  301. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
  302. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
  303. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
  304. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
  305. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
  306. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  307. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
  308. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
  309. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
  310. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
  311. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
  312. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
  313. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
  314. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
  315. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
  316. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
  317. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
  318. diffusers/pipelines/transformers_loading_utils.py +121 -0
  319. diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
  320. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
  321. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
  322. diffusers/pipelines/wan/__init__.py +51 -0
  323. diffusers/pipelines/wan/pipeline_output.py +20 -0
  324. diffusers/pipelines/wan/pipeline_wan.py +593 -0
  325. diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
  326. diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
  327. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
  328. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
  329. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
  330. diffusers/quantizers/auto.py +5 -1
  331. diffusers/quantizers/base.py +5 -9
  332. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
  333. diffusers/quantizers/bitsandbytes/utils.py +30 -20
  334. diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
  335. diffusers/quantizers/gguf/utils.py +4 -2
  336. diffusers/quantizers/quantization_config.py +59 -4
  337. diffusers/quantizers/quanto/__init__.py +1 -0
  338. diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
  339. diffusers/quantizers/quanto/utils.py +60 -0
  340. diffusers/quantizers/torchao/__init__.py +1 -1
  341. diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
  342. diffusers/schedulers/__init__.py +2 -1
  343. diffusers/schedulers/scheduling_consistency_models.py +1 -2
  344. diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
  345. diffusers/schedulers/scheduling_ddpm.py +2 -3
  346. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
  347. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
  348. diffusers/schedulers/scheduling_edm_euler.py +45 -10
  349. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
  350. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
  351. diffusers/schedulers/scheduling_heun_discrete.py +1 -1
  352. diffusers/schedulers/scheduling_lcm.py +1 -2
  353. diffusers/schedulers/scheduling_lms_discrete.py +1 -1
  354. diffusers/schedulers/scheduling_repaint.py +5 -1
  355. diffusers/schedulers/scheduling_scm.py +265 -0
  356. diffusers/schedulers/scheduling_tcd.py +1 -2
  357. diffusers/schedulers/scheduling_utils.py +2 -1
  358. diffusers/training_utils.py +14 -7
  359. diffusers/utils/__init__.py +10 -2
  360. diffusers/utils/constants.py +13 -1
  361. diffusers/utils/deprecation_utils.py +1 -1
  362. diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
  363. diffusers/utils/dummy_gguf_objects.py +17 -0
  364. diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
  365. diffusers/utils/dummy_pt_objects.py +233 -0
  366. diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
  367. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  368. diffusers/utils/dummy_torchao_objects.py +17 -0
  369. diffusers/utils/dynamic_modules_utils.py +1 -1
  370. diffusers/utils/export_utils.py +28 -3
  371. diffusers/utils/hub_utils.py +52 -102
  372. diffusers/utils/import_utils.py +121 -221
  373. diffusers/utils/loading_utils.py +14 -1
  374. diffusers/utils/logging.py +1 -2
  375. diffusers/utils/peft_utils.py +6 -14
  376. diffusers/utils/remote_utils.py +425 -0
  377. diffusers/utils/source_code_parsing_utils.py +52 -0
  378. diffusers/utils/state_dict_utils.py +15 -1
  379. diffusers/utils/testing_utils.py +243 -13
  380. diffusers/utils/torch_utils.py +10 -0
  381. diffusers/utils/typing_utils.py +91 -0
  382. diffusers/video_processor.py +1 -1
  383. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
  384. diffusers-0.33.0.dist-info/RECORD +608 -0
  385. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
  386. diffusers-0.32.1.dist-info/RECORD +0 -550
  387. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
  388. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
  389. {diffusers-0.32.1.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py

@@ -61,6 +61,17 @@ from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 if is_invisible_watermark_available():
     from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker

+
+from ...utils import is_torch_xla_available
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -241,12 +252,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         "feature_extractor",
         "image_encoder",
     ]
-    _callback_tensor_inputs = [
-        "latents",
-        "prompt_embeds",
-        "add_text_embeds",
-        "add_time_ids",
-    ]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "add_text_embeds", "add_time_ids", "control_image"]

     def __init__(
         self,
@@ -281,7 +287,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
             feature_extractor=feature_extractor,
             image_encoder=image_encoder,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
         self.control_image_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
@@ -422,7 +428,9 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                 prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                pooled_prompt_embeds = prompt_embeds[0]
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
+                    pooled_prompt_embeds = prompt_embeds[0]
+
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
@@ -481,8 +489,10 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                     uncond_input.input_ids.to(device),
                     output_hidden_states=True,
                 )
+
                 # We are only ALWAYS interested in the pooled output of the final text encoder
-                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
+                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                 negative_prompt_embeds_list.append(negative_prompt_embeds)
@@ -731,26 +741,6 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
         else:
             assert False

-        # Check `controlnet_conditioning_scale`
-        if (
-            isinstance(self.controlnet, ControlNetModel)
-            or is_compiled
-            and isinstance(self.controlnet._orig_mod, ControlNetModel)
-        ):
-            if not isinstance(controlnet_conditioning_scale, float):
-                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
-
-        elif (
-            isinstance(self.controlnet, ControlNetUnionModel)
-            or is_compiled
-            and isinstance(self.controlnet._orig_mod, ControlNetUnionModel)
-        ):
-            if not isinstance(controlnet_conditioning_scale, float):
-                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
-
-        else:
-            assert False
-
         if not isinstance(control_guidance_start, (tuple, list)):
             control_guidance_start = [control_guidance_start]

@@ -1291,6 +1281,8 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(

         if not isinstance(control_image, list):
             control_image = [control_image]
+        else:
+            control_image = control_image.copy()

         if not isinstance(control_mode, list):
             control_mode = [control_mode]
@@ -1565,6 +1557,7 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                     add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
                     add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    control_image = callback_outputs.pop("control_image", control_image)

                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -1573,6 +1566,9 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # If we do sequential model offloading, let's offload unet and controlnet
         # manually for max memory savings
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
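
Illustrative usage, not part of the diff: with "control_image" now listed in _callback_tensor_inputs, it can be requested and overridden from a step-end callback. The checkpoint ids and the control_mode index below are assumptions for the sketch; any SDXL base plus ControlNet-Union pair should behave the same way.

import torch
from PIL import Image
from diffusers import ControlNetUnionModel, StableDiffusionXLControlNetUnionImg2ImgPipeline

# Illustrative checkpoint ids.
controlnet = ControlNetUnionModel.from_pretrained(
    "xinsir/controlnet-union-sdxl-1.0", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetUnionImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

init_image = Image.new("RGB", (1024, 1024))     # placeholder; use a real source image
control_image = Image.new("RGB", (1024, 1024))  # placeholder; use a real control map


def inspect_control_image(pipe, step, timestep, callback_kwargs):
    # "control_image" is now a registered callback tensor input, so it can be read here
    # each step and returned (optionally modified) to override the value the pipeline uses.
    print(step, callback_kwargs["control_image"].shape)
    return callback_kwargs


result = pipe(
    prompt="a futuristic cityscape",
    image=init_image,
    control_image=[control_image],
    control_mode=[3],  # which control type the union checkpoint should apply (index is illustrative)
    callback_on_step_end=inspect_control_image,
    callback_on_step_end_tensor_inputs=["control_image"],
).images[0]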
diffusers/pipelines/controlnet/pipeline_flax_controlnet.py

@@ -75,7 +75,10 @@ EXAMPLE_DOC_STRING = """
         ...     "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32
         ... )
         >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
-        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, revision="flax", dtype=jnp.float32
+        ...     "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        ...     controlnet=controlnet,
+        ...     revision="flax",
+        ...     dtype=jnp.float32,
         ... )
         >>> params["controlnet"] = controlnet_params

@@ -132,8 +135,8 @@ class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline):
             [`FlaxDPMSolverMultistepScheduler`].
         safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
+            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
+            more details about a model's potential harms.
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
@@ -175,7 +178,7 @@ class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline):
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8

     def prepare_text_inputs(self, prompt: Union[str, List[str]]):
         if not isinstance(prompt, (str, list)):
diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py

@@ -232,8 +232,8 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
             Tuple[HunyuanDiT2DControlNetModel],
             HunyuanDiT2DMultiControlNetModel,
         ],
-        text_encoder_2=T5EncoderModel,
-        tokenizer_2=MT5Tokenizer,
+        text_encoder_2: Optional[T5EncoderModel] = None,
+        tokenizer_2: Optional[MT5Tokenizer] = None,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
@@ -269,9 +269,7 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )

-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
         self.default_sample_size = (
diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py

@@ -19,12 +19,14 @@ import torch
 from transformers import (
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    SiglipImageProcessor,
+    SiglipVisionModel,
     T5EncoderModel,
     T5TokenizerFast,
 )

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.controlnets.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
 from ...models.transformers import SD3Transformer2DModel
@@ -138,7 +140,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3ControlNetPipeline(
+    DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin
+):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -174,10 +178,14 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             Provides additional conditioning to the `unet` during the denoising process. If you set multiple
             ControlNets as a list, the outputs from each ControlNet are added together to create one combined
             additional conditioning.
+        image_encoder (`SiglipVisionModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`SiglipImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]

     def __init__(
@@ -194,6 +202,8 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
         controlnet: Union[
             SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
         ],
+        image_encoder: Optional[SiglipVisionModel] = None,
+        feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
         super().__init__()
         if isinstance(controlnet, (list, tuple)):
@@ -223,10 +233,10 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             transformer=transformer,
             scheduler=scheduler,
             controlnet=controlnet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
@@ -394,9 +404,9 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
@@ -727,6 +737,84 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
     def interrupt(self):
         return self._interrupt

+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -754,6 +842,8 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -843,6 +933,12 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -1040,7 +1136,22 @@ class StableDiffusion3ControlNetPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             # SD35 official 8b controlnet does not use encoder_hidden_states
             controlnet_encoder_hidden_states = None

-        # 7. Denoising loop
+        # 7. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 8. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
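
Illustrative usage, not part of the diff: the SD3 ControlNet pipelines now mix in SD3IPAdapterMixin and accept ip_adapter_image / ip_adapter_image_embeds. The sketch below assumes the load_ip_adapter and set_ip_adapter_scale entry points already exposed by the base SD3 pipeline through that mixin; the checkpoint ids are assumptions, so pick a ControlNet and IP-Adapter that match your SD3 base model.

import torch
from PIL import Image
from diffusers import SD3ControlNetModel, StableDiffusion3ControlNetPipeline

# Illustrative checkpoint ids.
controlnet = SD3ControlNetModel.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large-controlnet-canny", torch_dtype=torch.float16
)
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# SD3IPAdapterMixin entry points, shared with the base StableDiffusion3Pipeline.
pipe.load_ip_adapter("InstantX/SD3.5-Large-IP-Adapter")  # illustrative IP-Adapter repo
pipe.set_ip_adapter_scale(0.6)

control_image = Image.new("RGB", (1024, 1024))  # placeholder; use a real canny map
style_image = Image.new("RGB", (512, 512))      # placeholder; the IP-Adapter reference image

image = pipe(
    prompt="a cat sitting on a park bench",
    control_image=control_image,
    controlnet_conditioning_scale=0.7,
    ip_adapter_image=style_image,  # new argument added in this release
    num_inference_steps=28,
).images[0]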
diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py

@@ -19,12 +19,14 @@ import torch
 from transformers import (
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    SiglipImageProcessor,
+    SiglipModel,
     T5EncoderModel,
     T5TokenizerFast,
 )

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.controlnets.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
 from ...models.transformers import SD3Transformer2DModel
@@ -159,7 +161,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3ControlNetInpaintingPipeline(
+    DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin
+):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -192,13 +196,17 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
         controlnet ([`SD3ControlNetModel`] or `List[SD3ControlNetModel]` or [`SD3MultiControlNetModel`]):
-            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            Provides additional conditioning to the `transformer` during the denoising process. If you set multiple
             ControlNets as a list, the outputs from each ControlNet are added together to create one combined
             additional conditioning.
+        image_encoder (`PreTrainedModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`BaseImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]

     def __init__(
@@ -215,6 +223,8 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
         controlnet: Union[
             SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel
         ],
+        image_encoder: SiglipModel = None,
+        feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
         super().__init__()

@@ -229,10 +239,10 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             transformer=transformer,
             scheduler=scheduler,
             controlnet=controlnet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor, do_resize=True, do_convert_rgb=True, do_normalize=True
         )
@@ -412,9 +422,9 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
@@ -777,6 +787,84 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
     def interrupt(self):
         return self._interrupt

+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -805,6 +893,8 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -898,6 +988,12 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -1059,7 +1155,22 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
             ]
             controlnet_keep.append(keeps[0] if isinstance(self.controlnet, SD3ControlNetModel) else keeps)

-        # 7. Denoising loop
+        # 7. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 8. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
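
Illustrative usage, not part of the diff: when classifier-free guidance is on, prepare_ip_adapter_image_embeds returns (and later expects) the negative and positive image embeddings concatenated along dim 0, which is what the .chunk(2) in the hunk above relies on. A sketch of computing the embeddings once and reusing them, continuing from the previous example (pipe, style_image, control_image are defined there):

# Compute the IP-Adapter image embeddings once...
image_embeds = pipe.prepare_ip_adapter_image_embeds(
    ip_adapter_image=style_image,
    ip_adapter_image_embeds=None,
    device=pipe._execution_device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,  # result is [negative, positive] concatenated on dim 0
)

# ...and reuse them in later calls; the pipeline splits them again with .chunk(2),
# so the image encoder does not have to run a second time.
image = pipe(
    prompt="a dog in the same style",
    control_image=control_image,
    ip_adapter_image_embeds=image_embeds,
    num_inference_steps=28,
).images[0]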
diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py

@@ -30,6 +30,7 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -41,6 +42,13 @@ from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -178,7 +186,7 @@ class StableDiffusionControlNetXSPipeline(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
         self.control_image_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
@@ -884,6 +892,9 @@ class StableDiffusionControlNetXSPipeline(
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # If we do sequential model offloading, let's offload unet and controlnet
         # manually for max memory savings
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
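
Illustrative note, not part of the diff: the same is_torch_xla_available() guard and per-step xm.mark_step() call are rolled out across several pipelines in this release. The pattern in isolation, runnable with or without torch_xla installed (the comment on mark_step reflects XLA's lazy execution model in general, not a claim taken from this diff):

import torch
from diffusers.utils import is_torch_xla_available

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


def denoising_loop(num_steps: int = 4) -> None:
    latents = torch.randn(1, 4, 64, 64)
    for step in range(num_steps):
        latents = latents * 0.99  # stand-in for one scheduler/model step
        if XLA_AVAILABLE:
            # On XLA devices this cuts and executes the lazily traced graph once per step,
            # instead of deferring all the work to the end of the loop.
            xm.mark_step()


if __name__ == "__main__":
    denoising_loop()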