diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (389)
  1. diffusers/__init__.py +186 -3
  2. diffusers/configuration_utils.py +40 -12
  3. diffusers/dependency_versions_table.py +9 -2
  4. diffusers/hooks/__init__.py +9 -0
  5. diffusers/hooks/faster_cache.py +653 -0
  6. diffusers/hooks/group_offloading.py +793 -0
  7. diffusers/hooks/hooks.py +236 -0
  8. diffusers/hooks/layerwise_casting.py +245 -0
  9. diffusers/hooks/pyramid_attention_broadcast.py +311 -0
  10. diffusers/loaders/__init__.py +6 -0
  11. diffusers/loaders/ip_adapter.py +38 -30
  12. diffusers/loaders/lora_base.py +121 -86
  13. diffusers/loaders/lora_conversion_utils.py +504 -44
  14. diffusers/loaders/lora_pipeline.py +1769 -181
  15. diffusers/loaders/peft.py +167 -57
  16. diffusers/loaders/single_file.py +17 -2
  17. diffusers/loaders/single_file_model.py +53 -5
  18. diffusers/loaders/single_file_utils.py +646 -72
  19. diffusers/loaders/textual_inversion.py +9 -9
  20. diffusers/loaders/transformer_flux.py +8 -9
  21. diffusers/loaders/transformer_sd3.py +120 -39
  22. diffusers/loaders/unet.py +20 -7
  23. diffusers/models/__init__.py +22 -0
  24. diffusers/models/activations.py +9 -9
  25. diffusers/models/attention.py +0 -1
  26. diffusers/models/attention_processor.py +163 -25
  27. diffusers/models/auto_model.py +169 -0
  28. diffusers/models/autoencoders/__init__.py +2 -0
  29. diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
  30. diffusers/models/autoencoders/autoencoder_dc.py +106 -4
  31. diffusers/models/autoencoders/autoencoder_kl.py +0 -4
  32. diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
  33. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
  34. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
  35. diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
  36. diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
  37. diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
  38. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
  39. diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
  40. diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
  41. diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
  42. diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
  43. diffusers/models/autoencoders/vae.py +31 -141
  44. diffusers/models/autoencoders/vq_model.py +3 -0
  45. diffusers/models/cache_utils.py +108 -0
  46. diffusers/models/controlnets/__init__.py +1 -0
  47. diffusers/models/controlnets/controlnet.py +3 -8
  48. diffusers/models/controlnets/controlnet_flux.py +14 -42
  49. diffusers/models/controlnets/controlnet_sd3.py +58 -34
  50. diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
  51. diffusers/models/controlnets/controlnet_union.py +27 -18
  52. diffusers/models/controlnets/controlnet_xs.py +7 -46
  53. diffusers/models/controlnets/multicontrolnet_union.py +196 -0
  54. diffusers/models/embeddings.py +18 -7
  55. diffusers/models/model_loading_utils.py +122 -80
  56. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  57. diffusers/models/modeling_flax_utils.py +1 -1
  58. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  59. diffusers/models/modeling_utils.py +617 -272
  60. diffusers/models/normalization.py +67 -14
  61. diffusers/models/resnet.py +1 -1
  62. diffusers/models/transformers/__init__.py +6 -0
  63. diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
  64. diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
  65. diffusers/models/transformers/consisid_transformer_3d.py +789 -0
  66. diffusers/models/transformers/dit_transformer_2d.py +5 -19
  67. diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
  68. diffusers/models/transformers/latte_transformer_3d.py +20 -15
  69. diffusers/models/transformers/lumina_nextdit2d.py +3 -1
  70. diffusers/models/transformers/pixart_transformer_2d.py +4 -19
  71. diffusers/models/transformers/prior_transformer.py +5 -1
  72. diffusers/models/transformers/sana_transformer.py +144 -40
  73. diffusers/models/transformers/stable_audio_transformer.py +5 -20
  74. diffusers/models/transformers/transformer_2d.py +7 -22
  75. diffusers/models/transformers/transformer_allegro.py +9 -17
  76. diffusers/models/transformers/transformer_cogview3plus.py +6 -17
  77. diffusers/models/transformers/transformer_cogview4.py +462 -0
  78. diffusers/models/transformers/transformer_easyanimate.py +527 -0
  79. diffusers/models/transformers/transformer_flux.py +68 -110
  80. diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
  81. diffusers/models/transformers/transformer_ltx.py +53 -35
  82. diffusers/models/transformers/transformer_lumina2.py +548 -0
  83. diffusers/models/transformers/transformer_mochi.py +6 -17
  84. diffusers/models/transformers/transformer_omnigen.py +469 -0
  85. diffusers/models/transformers/transformer_sd3.py +56 -86
  86. diffusers/models/transformers/transformer_temporal.py +5 -11
  87. diffusers/models/transformers/transformer_wan.py +469 -0
  88. diffusers/models/unets/unet_1d.py +3 -1
  89. diffusers/models/unets/unet_2d.py +21 -20
  90. diffusers/models/unets/unet_2d_blocks.py +19 -243
  91. diffusers/models/unets/unet_2d_condition.py +4 -6
  92. diffusers/models/unets/unet_3d_blocks.py +14 -127
  93. diffusers/models/unets/unet_3d_condition.py +8 -12
  94. diffusers/models/unets/unet_i2vgen_xl.py +5 -13
  95. diffusers/models/unets/unet_kandinsky3.py +0 -4
  96. diffusers/models/unets/unet_motion_model.py +20 -114
  97. diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
  98. diffusers/models/unets/unet_stable_cascade.py +8 -35
  99. diffusers/models/unets/uvit_2d.py +1 -4
  100. diffusers/optimization.py +2 -2
  101. diffusers/pipelines/__init__.py +57 -8
  102. diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
  103. diffusers/pipelines/amused/pipeline_amused.py +15 -2
  104. diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
  105. diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
  106. diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
  107. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
  108. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
  109. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
  110. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
  111. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
  112. diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
  113. diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
  114. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
  115. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
  116. diffusers/pipelines/auto_pipeline.py +35 -14
  117. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  118. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
  119. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
  120. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
  121. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
  122. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
  123. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
  124. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
  125. diffusers/pipelines/cogview4/__init__.py +49 -0
  126. diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
  127. diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
  128. diffusers/pipelines/cogview4/pipeline_output.py +21 -0
  129. diffusers/pipelines/consisid/__init__.py +49 -0
  130. diffusers/pipelines/consisid/consisid_utils.py +357 -0
  131. diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
  132. diffusers/pipelines/consisid/pipeline_output.py +20 -0
  133. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
  134. diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
  135. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
  136. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
  137. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
  138. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
  139. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
  140. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
  141. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
  142. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
  143. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
  144. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
  145. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
  146. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
  147. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
  148. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
  149. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
  150. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
  151. diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
  152. diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
  153. diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
  154. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
  155. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
  156. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
  157. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
  158. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
  159. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
  160. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
  161. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  162. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
  163. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
  164. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
  165. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
  166. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
  167. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
  168. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
  169. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
  170. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  171. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  172. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
  173. diffusers/pipelines/dit/pipeline_dit.py +15 -2
  174. diffusers/pipelines/easyanimate/__init__.py +52 -0
  175. diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
  176. diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
  177. diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
  178. diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
  179. diffusers/pipelines/flux/pipeline_flux.py +53 -21
  180. diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
  181. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
  182. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
  183. diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
  184. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
  185. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
  186. diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
  187. diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
  188. diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
  189. diffusers/pipelines/free_noise_utils.py +3 -3
  190. diffusers/pipelines/hunyuan_video/__init__.py +4 -0
  191. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
  192. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
  193. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
  194. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
  195. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
  196. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
  197. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  198. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
  199. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
  200. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
  201. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
  202. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
  203. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
  204. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
  205. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
  206. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
  207. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
  208. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
  209. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
  210. diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
  211. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
  212. diffusers/pipelines/kolors/text_encoder.py +7 -34
  213. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
  214. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
  215. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
  216. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
  217. diffusers/pipelines/latte/pipeline_latte.py +36 -7
  218. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
  219. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
  220. diffusers/pipelines/ltx/__init__.py +2 -0
  221. diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
  222. diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
  223. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
  224. diffusers/pipelines/lumina/__init__.py +2 -2
  225. diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
  226. diffusers/pipelines/lumina2/__init__.py +48 -0
  227. diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
  228. diffusers/pipelines/marigold/__init__.py +2 -0
  229. diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
  230. diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
  231. diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
  232. diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
  233. diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
  234. diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
  235. diffusers/pipelines/omnigen/__init__.py +50 -0
  236. diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
  237. diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
  238. diffusers/pipelines/onnx_utils.py +5 -3
  239. diffusers/pipelines/pag/pag_utils.py +1 -1
  240. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
  241. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
  242. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
  243. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
  244. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
  245. diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
  246. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
  247. diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
  248. diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
  249. diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
  250. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
  251. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
  252. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
  253. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
  254. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
  255. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
  256. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
  257. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
  258. diffusers/pipelines/pia/pipeline_pia.py +13 -1
  259. diffusers/pipelines/pipeline_flax_utils.py +7 -7
  260. diffusers/pipelines/pipeline_loading_utils.py +193 -83
  261. diffusers/pipelines/pipeline_utils.py +221 -106
  262. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
  263. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
  264. diffusers/pipelines/sana/__init__.py +2 -0
  265. diffusers/pipelines/sana/pipeline_sana.py +183 -58
  266. diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
  267. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
  268. diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
  269. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
  270. diffusers/pipelines/shap_e/renderer.py +6 -6
  271. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
  272. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
  273. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
  274. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
  275. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
  276. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
  277. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
  278. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
  279. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  280. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
  281. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
  282. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
  283. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
  284. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
  285. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
  286. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
  287. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
  288. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
  289. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
  290. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
  291. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
  292. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
  293. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
  294. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
  295. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
  296. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
  297. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
  298. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
  299. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
  300. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
  301. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
  302. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
  303. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
  304. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
  305. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
  306. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  307. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
  308. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
  309. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
  310. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
  311. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
  312. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
  313. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
  314. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
  315. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
  316. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
  317. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
  318. diffusers/pipelines/transformers_loading_utils.py +121 -0
  319. diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
  320. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
  321. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
  322. diffusers/pipelines/wan/__init__.py +51 -0
  323. diffusers/pipelines/wan/pipeline_output.py +20 -0
  324. diffusers/pipelines/wan/pipeline_wan.py +593 -0
  325. diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
  326. diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
  327. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
  328. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
  329. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
  330. diffusers/quantizers/auto.py +5 -1
  331. diffusers/quantizers/base.py +5 -9
  332. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
  333. diffusers/quantizers/bitsandbytes/utils.py +30 -20
  334. diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
  335. diffusers/quantizers/gguf/utils.py +4 -2
  336. diffusers/quantizers/quantization_config.py +59 -4
  337. diffusers/quantizers/quanto/__init__.py +1 -0
  338. diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
  339. diffusers/quantizers/quanto/utils.py +60 -0
  340. diffusers/quantizers/torchao/__init__.py +1 -1
  341. diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
  342. diffusers/schedulers/__init__.py +2 -1
  343. diffusers/schedulers/scheduling_consistency_models.py +1 -2
  344. diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
  345. diffusers/schedulers/scheduling_ddpm.py +2 -3
  346. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
  347. diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
  348. diffusers/schedulers/scheduling_edm_euler.py +45 -10
  349. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
  350. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
  351. diffusers/schedulers/scheduling_heun_discrete.py +1 -1
  352. diffusers/schedulers/scheduling_lcm.py +1 -2
  353. diffusers/schedulers/scheduling_lms_discrete.py +1 -1
  354. diffusers/schedulers/scheduling_repaint.py +5 -1
  355. diffusers/schedulers/scheduling_scm.py +265 -0
  356. diffusers/schedulers/scheduling_tcd.py +1 -2
  357. diffusers/schedulers/scheduling_utils.py +2 -1
  358. diffusers/training_utils.py +14 -7
  359. diffusers/utils/__init__.py +9 -1
  360. diffusers/utils/constants.py +13 -1
  361. diffusers/utils/deprecation_utils.py +1 -1
  362. diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
  363. diffusers/utils/dummy_gguf_objects.py +17 -0
  364. diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
  365. diffusers/utils/dummy_pt_objects.py +233 -0
  366. diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
  367. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  368. diffusers/utils/dummy_torchao_objects.py +17 -0
  369. diffusers/utils/dynamic_modules_utils.py +1 -1
  370. diffusers/utils/export_utils.py +28 -3
  371. diffusers/utils/hub_utils.py +52 -102
  372. diffusers/utils/import_utils.py +121 -221
  373. diffusers/utils/loading_utils.py +2 -1
  374. diffusers/utils/logging.py +1 -2
  375. diffusers/utils/peft_utils.py +6 -14
  376. diffusers/utils/remote_utils.py +425 -0
  377. diffusers/utils/source_code_parsing_utils.py +52 -0
  378. diffusers/utils/state_dict_utils.py +15 -1
  379. diffusers/utils/testing_utils.py +243 -13
  380. diffusers/utils/torch_utils.py +10 -0
  381. diffusers/utils/typing_utils.py +91 -0
  382. diffusers/video_processor.py +1 -1
  383. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
  384. diffusers-0.33.0.dist-info/RECORD +608 -0
  385. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
  386. diffusers-0.32.2.dist-info/RECORD +0 -550
  387. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
  388. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
  389. {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py:

````diff
@@ -28,6 +28,7 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
     deprecate,
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -38,8 +39,16 @@ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffu
 from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
+
 EXAMPLE_DOC_STRING = """
     Examples:
     ```py
@@ -155,7 +164,7 @@ class StableUnCLIPImg2ImgPipeline(
             vae=vae,
         )
 
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
````
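The change above guards against pipelines constructed without a VAE: `getattr(self, "vae", None)` falls back to the standard factor of 8 when the component is registered as `None`. For the stock Stable Diffusion VAE the computed value equals that fallback; a quick check (the tuple below is the standard SD VAE config default):

```python
# The stock SD VAE config has four block_out_channels entries, so the
# computed downsampling factor matches the hard-coded fallback of 8.
block_out_channels = (128, 256, 512, 512)
vae_scale_factor = 2 ** (len(block_out_channels) - 1)
assert vae_scale_factor == 8
```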
```diff
@@ -829,6 +838,9 @@ class StableUnCLIPImg2ImgPipeline(
                     step_idx = i // getattr(self.scheduler, "order", 1)
                     callback(step_idx, t, latents)
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # 9. Post-processing
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
```
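For context, `torch_xla` executes tensor operations lazily; calling `xm.mark_step()` once per denoising step compiles and dispatches the graph accumulated so far instead of letting the whole loop build up into one enormous trace. A minimal standalone sketch of the same guard-and-flush pattern (it degrades to a plain CPU loop when `torch_xla` is absent):

```python
import torch

# Mirror the guard the diff adds: only touch torch_xla when it is installed.
try:
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False

device = xm.xla_device() if XLA_AVAILABLE else torch.device("cpu")
latents = torch.zeros(4, device=device)
for step in range(10):
    latents = latents + 1  # on XLA this is only recorded, not executed yet
    if XLA_AVAILABLE:
        xm.mark_step()  # compile and run the work accumulated this step
```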
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py:

```diff
@@ -17,10 +17,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from transformers import (
-    BaseImageProcessor,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
-    PreTrainedModel,
+    SiglipImageProcessor,
+    SiglipVisionModel,
     T5EncoderModel,
     T5TokenizerFast,
 )
@@ -76,7 +76,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
```
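`calculate_shift` is a line through the points (`base_seq_len`, `base_shift`) and (`max_seq_len`, `max_shift`): `m` is its slope, `b` its intercept, and the returned `mu = m * image_seq_len + b`. A worked example with the new defaults (note `max_shift` drops from 1.16 to 1.15 in this release):

```python
def calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b

# The endpoints recover the configured shifts exactly:
print(calculate_shift(256))   # 0.5
print(calculate_shift(4096))  # 1.15 -- a 1024px SD3 image (64x64 patches = 4096 tokens)
```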
```diff
@@ -176,9 +176,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_3 (`T5TokenizerFast`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        image_encoder (`PreTrainedModel`, *optional*):
+        image_encoder (`SiglipVisionModel`, *optional*):
             Pre-trained Vision Model for IP Adapter.
-        feature_extractor (`BaseImageProcessor`, *optional*):
+        feature_extractor (`SiglipImageProcessor`, *optional*):
             Image processor for IP Adapter.
     """
 
@@ -197,8 +197,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_2: CLIPTokenizer,
         text_encoder_3: T5EncoderModel,
         tokenizer_3: T5TokenizerFast,
-        image_encoder: PreTrainedModel = None,
-        feature_extractor: BaseImageProcessor = None,
+        image_encoder: SiglipVisionModel = None,
+        feature_extractor: SiglipImageProcessor = None,
     ):
         super().__init__()
 
@@ -215,9 +215,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             image_encoder=image_encoder,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = (
-            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
-        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.tokenizer_max_length = (
             self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
@@ -385,9 +383,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
@@ -870,7 +868,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
-            ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
             ip_adapter_image_embeds (`torch.Tensor`, *optional*):
                 Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
                 emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
@@ -1014,10 +1013,10 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             )
             mu = calculate_shift(
                 image_seq_len,
-                self.scheduler.config.base_image_seq_len,
-                self.scheduler.config.max_image_seq_len,
-                self.scheduler.config.base_shift,
-                self.scheduler.config.max_shift,
+                self.scheduler.config.get("base_image_seq_len", 256),
+                self.scheduler.config.get("max_image_seq_len", 4096),
+                self.scheduler.config.get("base_shift", 0.5),
+                self.scheduler.config.get("max_shift", 1.16),
             )
             scheduler_kwargs["mu"] = mu
         elif mu is not None:
```
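Replacing attribute access with `config.get(...)` makes the pipeline tolerant of scheduler configs saved before these keys existed: a missing key now yields the inline default instead of raising (note the inline fallback for `max_shift` stays 1.16 while the function default above is now 1.15). Scheduler configs are `FrozenDict`s, so `.get` behaves as on a plain dict; a minimal sketch:

```python
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()
# Present keys return the stored value, absent keys the supplied default,
# whereas attribute access on a missing key would raise AttributeError.
base_image_seq_len = scheduler.config.get("base_image_seq_len", 256)
max_shift = scheduler.config.get("max_shift", 1.16)
```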
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py:

```diff
@@ -20,12 +20,14 @@ import torch
 from transformers import (
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    SiglipImageProcessor,
+    SiglipVisionModel,
     T5EncoderModel,
     T5TokenizerFast,
 )
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import SD3Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -81,7 +83,7 @@ def calculate_shift(
     base_seq_len: int = 256,
     max_seq_len: int = 4096,
     base_shift: float = 0.5,
-    max_shift: float = 1.16,
+    max_shift: float = 1.15,
 ):
     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
     b = base_shift - m * base_seq_len
@@ -163,7 +165,7 @@
     return timesteps, num_inference_steps
 
 
-class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -195,10 +197,14 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         tokenizer_3 (`T5TokenizerFast`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        image_encoder (`SiglipVisionModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`SiglipImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """
 
-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
 
     def __init__(
```
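Adding `image_encoder` and `feature_extractor` to `_optional_components` lets `from_pretrained` succeed for checkpoints that do not ship them (the usual case for SD3 repos), and the updated `model_cpu_offload_seq` places the encoder in the offload order. A loading sketch; the checkpoint id is an assumption, not part of this diff:

```python
import torch
from diffusers import StableDiffusion3Img2ImgPipeline

# With image_encoder/feature_extractor optional, they simply load as None,
# and the __init__ fallbacks in the hunks that follow (vae_scale_factor,
# tokenizer_max_length, default_sample_size) supply working defaults.
pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # assumed checkpoint id
    torch_dtype=torch.float16,
)
pipe.enable_model_cpu_offload()  # follows the updated model_cpu_offload_seq
```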
```diff
@@ -212,6 +218,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         tokenizer_2: CLIPTokenizer,
         text_encoder_3: T5EncoderModel,
         tokenizer_3: T5TokenizerFast,
+        image_encoder: Optional[SiglipVisionModel] = None,
+        feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
         super().__init__()
 
@@ -225,13 +233,22 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             tokenizer_3=tokenizer_3,
             transformer=transformer,
             scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
+        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
         self.image_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
+            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=latent_channels
+        )
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = (
+            self.transformer.config.sample_size
+            if hasattr(self, "transformer") and self.transformer is not None
+            else 128
         )
-        self.tokenizer_max_length = self.tokenizer.model_max_length
-        self.default_sample_size = self.transformer.config.sample_size
         self.patch_size = (
             self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
         )
@@ -393,9 +410,9 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
+            negative_prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
-                `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders
+                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
             prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
@@ -731,6 +748,84 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
     def interrupt(self):
         return self._interrupt
 
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
```
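With `SD3IPAdapterMixin` now in the class bases and the methods above copied over from the text-to-image pipeline, img2img gains the `ip_adapter_image`/`ip_adapter_image_embeds` arguments added in the next hunk. A usage sketch under stated assumptions (the base checkpoint and adapter repo ids are illustrative, not taken from this diff):

```python
import torch
from diffusers import StableDiffusion3Img2ImgPipeline
from diffusers.utils import load_image

pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",  # assumed checkpoint id
    torch_dtype=torch.bfloat16,
).to("cuda")
# load_ip_adapter / set_ip_adapter_scale come from SD3IPAdapterMixin.
pipe.load_ip_adapter("InstantX/SD3.5-Large-IP-Adapter")  # assumed adapter repo
pipe.set_ip_adapter_scale(0.6)

init_image = load_image("https://example.com/init.png")    # placeholder URL
style_image = load_image("https://example.com/style.png")  # placeholder URL
image = pipe(
    prompt="a fantasy landscape, detailed matte painting",
    image=init_image,
    ip_adapter_image=style_image,  # routed through prepare_ip_adapter_image_embeds
    strength=0.7,
).images[0]
```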
```diff
@@ -756,6 +851,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
@@ -777,9 +874,9 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
                 will be used instead
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image. This is set to 1024 by default for the best results.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -827,6 +924,12 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -936,10 +1039,10 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             )
             mu = calculate_shift(
                 image_seq_len,
-                self.scheduler.config.base_image_seq_len,
-                self.scheduler.config.max_image_seq_len,
-                self.scheduler.config.base_shift,
-                self.scheduler.config.max_shift,
+                self.scheduler.config.get("base_image_seq_len", 256),
+                self.scheduler.config.get("max_image_seq_len", 4096),
+                self.scheduler.config.get("base_shift", 0.5),
+                self.scheduler.config.get("max_shift", 1.16),
             )
             scheduler_kwargs["mu"] = mu
         elif mu is not None:
@@ -962,7 +1065,22 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             generator,
         )
 
-        # 6. Denoising loop
+        # 6. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
```