diffusers 0.27.1__py3-none-any.whl → 0.32.2__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (445) hide show
  1. diffusers/__init__.py +233 -6
  2. diffusers/callbacks.py +209 -0
  3. diffusers/commands/env.py +102 -6
  4. diffusers/configuration_utils.py +45 -16
  5. diffusers/dependency_versions_table.py +4 -3
  6. diffusers/image_processor.py +434 -110
  7. diffusers/loaders/__init__.py +42 -9
  8. diffusers/loaders/ip_adapter.py +626 -36
  9. diffusers/loaders/lora_base.py +900 -0
  10. diffusers/loaders/lora_conversion_utils.py +991 -125
  11. diffusers/loaders/lora_pipeline.py +3812 -0
  12. diffusers/loaders/peft.py +571 -7
  13. diffusers/loaders/single_file.py +405 -173
  14. diffusers/loaders/single_file_model.py +385 -0
  15. diffusers/loaders/single_file_utils.py +1783 -713
  16. diffusers/loaders/textual_inversion.py +41 -23
  17. diffusers/loaders/transformer_flux.py +181 -0
  18. diffusers/loaders/transformer_sd3.py +89 -0
  19. diffusers/loaders/unet.py +464 -540
  20. diffusers/loaders/unet_loader_utils.py +163 -0
  21. diffusers/models/__init__.py +76 -7
  22. diffusers/models/activations.py +65 -10
  23. diffusers/models/adapter.py +53 -53
  24. diffusers/models/attention.py +605 -18
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +4304 -687
  27. diffusers/models/autoencoders/__init__.py +8 -0
  28. diffusers/models/autoencoders/autoencoder_asym_kl.py +15 -17
  29. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  30. diffusers/models/autoencoders/autoencoder_kl.py +110 -28
  31. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  32. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1482 -0
  33. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  34. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  35. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  36. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +19 -24
  37. diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
  38. diffusers/models/autoencoders/autoencoder_tiny.py +21 -18
  39. diffusers/models/autoencoders/consistency_decoder_vae.py +45 -20
  40. diffusers/models/autoencoders/vae.py +41 -29
  41. diffusers/models/autoencoders/vq_model.py +182 -0
  42. diffusers/models/controlnet.py +47 -800
  43. diffusers/models/controlnet_flux.py +70 -0
  44. diffusers/models/controlnet_sd3.py +68 -0
  45. diffusers/models/controlnet_sparsectrl.py +116 -0
  46. diffusers/models/controlnets/__init__.py +23 -0
  47. diffusers/models/controlnets/controlnet.py +872 -0
  48. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +9 -9
  49. diffusers/models/controlnets/controlnet_flux.py +536 -0
  50. diffusers/models/controlnets/controlnet_hunyuan.py +401 -0
  51. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  52. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  53. diffusers/models/controlnets/controlnet_union.py +832 -0
  54. diffusers/models/controlnets/controlnet_xs.py +1946 -0
  55. diffusers/models/controlnets/multicontrolnet.py +183 -0
  56. diffusers/models/downsampling.py +85 -18
  57. diffusers/models/embeddings.py +1856 -158
  58. diffusers/models/embeddings_flax.py +23 -9
  59. diffusers/models/model_loading_utils.py +480 -0
  60. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  61. diffusers/models/modeling_flax_utils.py +2 -7
  62. diffusers/models/modeling_outputs.py +14 -0
  63. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  64. diffusers/models/modeling_utils.py +611 -146
  65. diffusers/models/normalization.py +361 -20
  66. diffusers/models/resnet.py +18 -23
  67. diffusers/models/transformers/__init__.py +16 -0
  68. diffusers/models/transformers/auraflow_transformer_2d.py +544 -0
  69. diffusers/models/transformers/cogvideox_transformer_3d.py +542 -0
  70. diffusers/models/transformers/dit_transformer_2d.py +240 -0
  71. diffusers/models/transformers/dual_transformer_2d.py +9 -8
  72. diffusers/models/transformers/hunyuan_transformer_2d.py +578 -0
  73. diffusers/models/transformers/latte_transformer_3d.py +327 -0
  74. diffusers/models/transformers/lumina_nextdit2d.py +340 -0
  75. diffusers/models/transformers/pixart_transformer_2d.py +445 -0
  76. diffusers/models/transformers/prior_transformer.py +13 -13
  77. diffusers/models/transformers/sana_transformer.py +488 -0
  78. diffusers/models/transformers/stable_audio_transformer.py +458 -0
  79. diffusers/models/transformers/t5_film_transformer.py +17 -19
  80. diffusers/models/transformers/transformer_2d.py +297 -187
  81. diffusers/models/transformers/transformer_allegro.py +422 -0
  82. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  83. diffusers/models/transformers/transformer_flux.py +593 -0
  84. diffusers/models/transformers/transformer_hunyuan_video.py +791 -0
  85. diffusers/models/transformers/transformer_ltx.py +469 -0
  86. diffusers/models/transformers/transformer_mochi.py +499 -0
  87. diffusers/models/transformers/transformer_sd3.py +461 -0
  88. diffusers/models/transformers/transformer_temporal.py +21 -19
  89. diffusers/models/unets/unet_1d.py +8 -8
  90. diffusers/models/unets/unet_1d_blocks.py +31 -31
  91. diffusers/models/unets/unet_2d.py +17 -10
  92. diffusers/models/unets/unet_2d_blocks.py +225 -149
  93. diffusers/models/unets/unet_2d_condition.py +41 -40
  94. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  95. diffusers/models/unets/unet_3d_blocks.py +192 -1057
  96. diffusers/models/unets/unet_3d_condition.py +22 -27
  97. diffusers/models/unets/unet_i2vgen_xl.py +22 -18
  98. diffusers/models/unets/unet_kandinsky3.py +2 -2
  99. diffusers/models/unets/unet_motion_model.py +1413 -89
  100. diffusers/models/unets/unet_spatio_temporal_condition.py +40 -16
  101. diffusers/models/unets/unet_stable_cascade.py +19 -18
  102. diffusers/models/unets/uvit_2d.py +2 -2
  103. diffusers/models/upsampling.py +95 -26
  104. diffusers/models/vq_model.py +12 -164
  105. diffusers/optimization.py +1 -1
  106. diffusers/pipelines/__init__.py +202 -3
  107. diffusers/pipelines/allegro/__init__.py +48 -0
  108. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  109. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  110. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  111. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  112. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  113. diffusers/pipelines/animatediff/__init__.py +8 -0
  114. diffusers/pipelines/animatediff/pipeline_animatediff.py +122 -109
  115. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1106 -0
  116. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1288 -0
  117. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1010 -0
  118. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +236 -180
  119. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  120. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  121. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  122. diffusers/pipelines/audioldm2/modeling_audioldm2.py +58 -39
  123. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +121 -36
  124. diffusers/pipelines/aura_flow/__init__.py +48 -0
  125. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +584 -0
  126. diffusers/pipelines/auto_pipeline.py +196 -28
  127. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  128. diffusers/pipelines/blip_diffusion/modeling_blip2.py +6 -6
  129. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  130. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  131. diffusers/pipelines/cogvideo/__init__.py +54 -0
  132. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +772 -0
  133. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  134. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +885 -0
  135. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +851 -0
  136. diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
  137. diffusers/pipelines/cogview3/__init__.py +47 -0
  138. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  139. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  140. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +6 -6
  141. diffusers/pipelines/controlnet/__init__.py +86 -80
  142. diffusers/pipelines/controlnet/multicontrolnet.py +7 -182
  143. diffusers/pipelines/controlnet/pipeline_controlnet.py +134 -87
  144. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  145. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +93 -77
  146. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +88 -197
  147. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +136 -90
  148. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +176 -80
  149. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +125 -89
  150. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  151. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  152. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  153. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  154. diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
  155. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1060 -0
  156. diffusers/pipelines/controlnet_sd3/__init__.py +57 -0
  157. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1133 -0
  158. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  159. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  160. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +916 -0
  161. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1111 -0
  162. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  163. diffusers/pipelines/deepfloyd_if/pipeline_if.py +16 -30
  164. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +20 -35
  165. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +23 -41
  166. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +22 -38
  167. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +25 -41
  168. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +19 -34
  169. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  170. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  171. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  172. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +70 -30
  173. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +48 -25
  174. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  175. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  176. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +21 -20
  177. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +27 -29
  178. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +33 -27
  179. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +33 -23
  180. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +36 -30
  181. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +102 -69
  182. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  183. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  184. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  185. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  186. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  187. diffusers/pipelines/dit/pipeline_dit.py +7 -4
  188. diffusers/pipelines/flux/__init__.py +69 -0
  189. diffusers/pipelines/flux/modeling_flux.py +47 -0
  190. diffusers/pipelines/flux/pipeline_flux.py +957 -0
  191. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  192. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  193. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  194. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  195. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  196. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  197. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  198. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  199. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  200. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  201. diffusers/pipelines/flux/pipeline_output.py +37 -0
  202. diffusers/pipelines/free_init_utils.py +41 -38
  203. diffusers/pipelines/free_noise_utils.py +596 -0
  204. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  205. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  206. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  207. diffusers/pipelines/hunyuandit/__init__.py +48 -0
  208. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +916 -0
  209. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  210. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  211. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +32 -29
  212. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  213. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  214. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  215. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  216. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +34 -31
  217. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  218. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  219. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  220. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  221. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  222. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  223. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  224. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +22 -35
  225. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +26 -37
  226. diffusers/pipelines/kolors/__init__.py +54 -0
  227. diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
  228. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1250 -0
  229. diffusers/pipelines/kolors/pipeline_output.py +21 -0
  230. diffusers/pipelines/kolors/text_encoder.py +889 -0
  231. diffusers/pipelines/kolors/tokenizer.py +338 -0
  232. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +82 -62
  233. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +77 -60
  234. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +12 -12
  235. diffusers/pipelines/latte/__init__.py +48 -0
  236. diffusers/pipelines/latte/pipeline_latte.py +881 -0
  237. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +80 -74
  238. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +85 -76
  239. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  240. diffusers/pipelines/ltx/__init__.py +50 -0
  241. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  242. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  243. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  244. diffusers/pipelines/lumina/__init__.py +48 -0
  245. diffusers/pipelines/lumina/pipeline_lumina.py +890 -0
  246. diffusers/pipelines/marigold/__init__.py +50 -0
  247. diffusers/pipelines/marigold/marigold_image_processing.py +576 -0
  248. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  249. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  250. diffusers/pipelines/mochi/__init__.py +48 -0
  251. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  252. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  253. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  254. diffusers/pipelines/pag/__init__.py +80 -0
  255. diffusers/pipelines/pag/pag_utils.py +243 -0
  256. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1328 -0
  257. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  258. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1610 -0
  259. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  260. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +969 -0
  261. diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
  262. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +865 -0
  263. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  264. diffusers/pipelines/pag/pipeline_pag_sd.py +1062 -0
  265. diffusers/pipelines/pag/pipeline_pag_sd_3.py +994 -0
  266. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  267. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +866 -0
  268. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  269. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  270. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1345 -0
  271. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1544 -0
  272. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1776 -0
  273. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  274. diffusers/pipelines/pia/pipeline_pia.py +74 -164
  275. diffusers/pipelines/pipeline_flax_utils.py +5 -10
  276. diffusers/pipelines/pipeline_loading_utils.py +515 -53
  277. diffusers/pipelines/pipeline_utils.py +411 -222
  278. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  279. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +76 -93
  280. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +873 -0
  281. diffusers/pipelines/sana/__init__.py +47 -0
  282. diffusers/pipelines/sana/pipeline_output.py +21 -0
  283. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  284. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +27 -23
  285. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  286. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  287. diffusers/pipelines/shap_e/renderer.py +1 -1
  288. diffusers/pipelines/stable_audio/__init__.py +50 -0
  289. diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
  290. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +756 -0
  291. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +71 -25
  292. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  293. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +35 -34
  294. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  295. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +20 -11
  296. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  297. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  298. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  299. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +145 -79
  300. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +43 -28
  301. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  302. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +100 -68
  303. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +109 -201
  304. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +131 -32
  305. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +247 -87
  306. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +30 -29
  307. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +35 -27
  308. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +49 -42
  309. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  310. diffusers/pipelines/stable_diffusion_3/__init__.py +54 -0
  311. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  312. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +1140 -0
  313. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +1036 -0
  314. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1250 -0
  315. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +29 -20
  316. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +59 -58
  317. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +31 -25
  318. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +38 -22
  319. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -24
  320. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -23
  321. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +107 -67
  322. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +316 -69
  323. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  324. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  325. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +98 -30
  326. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +121 -83
  327. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +161 -105
  328. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +142 -218
  329. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -29
  330. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  331. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  332. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +69 -39
  333. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +105 -74
  334. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  335. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +29 -49
  336. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +32 -93
  337. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +37 -25
  338. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +54 -40
  339. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  340. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  341. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  342. diffusers/pipelines/unidiffuser/modeling_uvit.py +12 -12
  343. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +29 -28
  344. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  345. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  346. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +6 -8
  347. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  348. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  349. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +15 -14
  350. diffusers/{models/dual_transformer_2d.py → quantizers/__init__.py} +2 -6
  351. diffusers/quantizers/auto.py +139 -0
  352. diffusers/quantizers/base.py +233 -0
  353. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  354. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  355. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  356. diffusers/quantizers/gguf/__init__.py +1 -0
  357. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  358. diffusers/quantizers/gguf/utils.py +456 -0
  359. diffusers/quantizers/quantization_config.py +669 -0
  360. diffusers/quantizers/torchao/__init__.py +15 -0
  361. diffusers/quantizers/torchao/torchao_quantizer.py +292 -0
  362. diffusers/schedulers/__init__.py +12 -2
  363. diffusers/schedulers/deprecated/__init__.py +1 -1
  364. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  365. diffusers/schedulers/scheduling_amused.py +5 -5
  366. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  367. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  368. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
  369. diffusers/schedulers/scheduling_ddim.py +27 -26
  370. diffusers/schedulers/scheduling_ddim_cogvideox.py +452 -0
  371. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  372. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  373. diffusers/schedulers/scheduling_ddim_parallel.py +32 -31
  374. diffusers/schedulers/scheduling_ddpm.py +27 -30
  375. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  376. diffusers/schedulers/scheduling_ddpm_parallel.py +33 -36
  377. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  378. diffusers/schedulers/scheduling_deis_multistep.py +150 -50
  379. diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
  380. diffusers/schedulers/scheduling_dpmsolver_multistep.py +221 -84
  381. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  382. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +158 -52
  383. diffusers/schedulers/scheduling_dpmsolver_sde.py +153 -34
  384. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +275 -86
  385. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +81 -57
  386. diffusers/schedulers/scheduling_edm_euler.py +62 -39
  387. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +30 -29
  388. diffusers/schedulers/scheduling_euler_discrete.py +255 -74
  389. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +458 -0
  390. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +320 -0
  391. diffusers/schedulers/scheduling_heun_discrete.py +174 -46
  392. diffusers/schedulers/scheduling_ipndm.py +9 -9
  393. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +138 -29
  394. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +132 -26
  395. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  396. diffusers/schedulers/scheduling_lcm.py +23 -29
  397. diffusers/schedulers/scheduling_lms_discrete.py +105 -28
  398. diffusers/schedulers/scheduling_pndm.py +20 -20
  399. diffusers/schedulers/scheduling_repaint.py +21 -21
  400. diffusers/schedulers/scheduling_sasolver.py +157 -60
  401. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  402. diffusers/schedulers/scheduling_tcd.py +41 -36
  403. diffusers/schedulers/scheduling_unclip.py +19 -16
  404. diffusers/schedulers/scheduling_unipc_multistep.py +243 -47
  405. diffusers/schedulers/scheduling_utils.py +12 -5
  406. diffusers/schedulers/scheduling_utils_flax.py +1 -3
  407. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  408. diffusers/training_utils.py +214 -30
  409. diffusers/utils/__init__.py +17 -1
  410. diffusers/utils/constants.py +3 -0
  411. diffusers/utils/doc_utils.py +1 -0
  412. diffusers/utils/dummy_pt_objects.py +592 -7
  413. diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
  414. diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
  415. diffusers/utils/dummy_torch_and_transformers_objects.py +1001 -71
  416. diffusers/utils/dynamic_modules_utils.py +34 -29
  417. diffusers/utils/export_utils.py +50 -6
  418. diffusers/utils/hub_utils.py +131 -17
  419. diffusers/utils/import_utils.py +210 -8
  420. diffusers/utils/loading_utils.py +118 -5
  421. diffusers/utils/logging.py +4 -2
  422. diffusers/utils/peft_utils.py +37 -7
  423. diffusers/utils/state_dict_utils.py +13 -2
  424. diffusers/utils/testing_utils.py +193 -11
  425. diffusers/utils/torch_utils.py +4 -0
  426. diffusers/video_processor.py +113 -0
  427. {diffusers-0.27.1.dist-info → diffusers-0.32.2.dist-info}/METADATA +82 -91
  428. diffusers-0.32.2.dist-info/RECORD +550 -0
  429. {diffusers-0.27.1.dist-info → diffusers-0.32.2.dist-info}/WHEEL +1 -1
  430. diffusers/loaders/autoencoder.py +0 -146
  431. diffusers/loaders/controlnet.py +0 -136
  432. diffusers/loaders/lora.py +0 -1349
  433. diffusers/models/prior_transformer.py +0 -12
  434. diffusers/models/t5_film_transformer.py +0 -70
  435. diffusers/models/transformer_2d.py +0 -25
  436. diffusers/models/transformer_temporal.py +0 -34
  437. diffusers/models/unet_1d.py +0 -26
  438. diffusers/models/unet_1d_blocks.py +0 -203
  439. diffusers/models/unet_2d.py +0 -27
  440. diffusers/models/unet_2d_blocks.py +0 -375
  441. diffusers/models/unet_2d_condition.py +0 -25
  442. diffusers-0.27.1.dist-info/RECORD +0 -399
  443. {diffusers-0.27.1.dist-info → diffusers-0.32.2.dist-info}/LICENSE +0 -0
  444. {diffusers-0.27.1.dist-info → diffusers-0.32.2.dist-info}/entry_points.txt +0 -0
  445. {diffusers-0.27.1.dist-info → diffusers-0.32.2.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@ from transformers import (
25
25
  CLIPVisionModelWithProjection,
26
26
  )
27
27
 
28
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
28
29
  from ...image_processor import PipelineImageInput, VaeImageProcessor
29
30
  from ...loaders import (
30
31
  FromSingleFileMixin,
@@ -35,8 +36,6 @@ from ...loaders import (
35
36
  from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
36
37
  from ...models.attention_processor import (
37
38
  AttnProcessor2_0,
38
- LoRAAttnProcessor2_0,
39
- LoRAXFormersAttnProcessor,
40
39
  XFormersAttnProcessor,
41
40
  )
42
41
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -91,9 +90,21 @@ EXAMPLE_DOC_STRING = """
91
90
 
92
91
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
93
92
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
94
- """
95
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
96
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
93
+ r"""
94
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
95
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
96
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
97
+
98
+ Args:
99
+ noise_cfg (`torch.Tensor`):
100
+ The predicted noise tensor for the guided diffusion process.
101
+ noise_pred_text (`torch.Tensor`):
102
+ The predicted noise tensor for the text-guided diffusion process.
103
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
104
+ A rescale factor applied to the noise predictions.
105
+
106
+ Returns:
107
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
97
108
  """
98
109
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
99
110
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -124,9 +135,10 @@ def retrieve_timesteps(
124
135
  num_inference_steps: Optional[int] = None,
125
136
  device: Optional[Union[str, torch.device]] = None,
126
137
  timesteps: Optional[List[int]] = None,
138
+ sigmas: Optional[List[float]] = None,
127
139
  **kwargs,
128
140
  ):
129
- """
141
+ r"""
130
142
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
131
143
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
132
144
 
@@ -134,19 +146,23 @@ def retrieve_timesteps(
134
146
  scheduler (`SchedulerMixin`):
135
147
  The scheduler to get timesteps from.
136
148
  num_inference_steps (`int`):
137
- The number of diffusion steps used when generating samples with a pre-trained model. If used,
138
- `timesteps` must be `None`.
149
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
150
+ must be `None`.
139
151
  device (`str` or `torch.device`, *optional*):
140
152
  The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
141
153
  timesteps (`List[int]`, *optional*):
142
- Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
143
- timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
144
- must be `None`.
154
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
155
+ `num_inference_steps` and `sigmas` must be `None`.
156
+ sigmas (`List[float]`, *optional*):
157
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
158
+ `num_inference_steps` and `timesteps` must be `None`.
145
159
 
146
160
  Returns:
147
161
  `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
148
162
  second element is the number of inference steps.
149
163
  """
164
+ if timesteps is not None and sigmas is not None:
165
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
150
166
  if timesteps is not None:
151
167
  accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
152
168
  if not accepts_timesteps:
@@ -157,6 +173,16 @@ def retrieve_timesteps(
157
173
  scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
158
174
  timesteps = scheduler.timesteps
159
175
  num_inference_steps = len(timesteps)
176
+ elif sigmas is not None:
177
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
178
+ if not accept_sigmas:
179
+ raise ValueError(
180
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
181
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
182
+ )
183
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
184
+ timesteps = scheduler.timesteps
185
+ num_inference_steps = len(timesteps)
160
186
  else:
161
187
  scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
162
188
  timesteps = scheduler.timesteps
@@ -231,11 +257,8 @@ class StableDiffusionXLImg2ImgPipeline(
231
257
  _callback_tensor_inputs = [
232
258
  "latents",
233
259
  "prompt_embeds",
234
- "negative_prompt_embeds",
235
260
  "add_text_embeds",
236
261
  "add_time_ids",
237
- "negative_pooled_prompt_embeds",
238
- "add_neg_time_ids",
239
262
  ]
240
263
 
241
264
  def __init__(
@@ -288,10 +311,10 @@ class StableDiffusionXLImg2ImgPipeline(
288
311
  do_classifier_free_guidance: bool = True,
289
312
  negative_prompt: Optional[str] = None,
290
313
  negative_prompt_2: Optional[str] = None,
291
- prompt_embeds: Optional[torch.FloatTensor] = None,
292
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
293
- pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
294
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
314
+ prompt_embeds: Optional[torch.Tensor] = None,
315
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
316
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
317
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
295
318
  lora_scale: Optional[float] = None,
296
319
  clip_skip: Optional[int] = None,
297
320
  ):
@@ -317,17 +340,17 @@ class StableDiffusionXLImg2ImgPipeline(
317
340
  negative_prompt_2 (`str` or `List[str]`, *optional*):
318
341
  The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
319
342
  `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
320
- prompt_embeds (`torch.FloatTensor`, *optional*):
343
+ prompt_embeds (`torch.Tensor`, *optional*):
321
344
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
322
345
  provided, text embeddings will be generated from `prompt` input argument.
323
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
346
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
324
347
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
325
348
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
326
349
  argument.
327
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
350
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
328
351
  Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
329
352
  If not provided, pooled text embeddings will be generated from `prompt` input argument.
330
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
353
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
331
354
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
332
355
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
333
356
  input argument.
@@ -626,14 +649,16 @@ class StableDiffusionXLImg2ImgPipeline(
626
649
  if denoising_start is None:
627
650
  init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
628
651
  t_start = max(num_inference_steps - init_timestep, 0)
629
- else:
630
- t_start = 0
631
652
 
632
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
653
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
654
+ if hasattr(self.scheduler, "set_begin_index"):
655
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
656
+
657
+ return timesteps, num_inference_steps - t_start
633
658
 
634
- # Strength is irrelevant if we directly request a timestep to start at;
635
- # that is, strength is determined by the denoising_start instead.
636
- if denoising_start is not None:
659
+ else:
660
+ # Strength is irrelevant if we directly request a timestep to start at;
661
+ # that is, strength is determined by the denoising_start instead.
637
662
  discrete_timestep_cutoff = int(
638
663
  round(
639
664
  self.scheduler.config.num_train_timesteps
@@ -641,22 +666,23 @@ class StableDiffusionXLImg2ImgPipeline(
641
666
  )
642
667
  )
643
668
 
644
- num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
669
+ num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
645
670
  if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
646
671
  # if the scheduler is a 2nd order scheduler we might have to do +1
647
672
  # because `num_inference_steps` might be even given that every timestep
648
673
  # (except the highest one) is duplicated. If `num_inference_steps` is even it would
649
674
  # mean that we cut the timesteps in the middle of the denoising step
650
- # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
675
+ # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
651
676
  # we ensure that the denoising process always ends after the 2nd derivative step of the scheduler
652
677
  num_inference_steps = num_inference_steps + 1
653
678
 
654
679
  # because t_n+1 >= t_n, we slice the timesteps starting from the end
655
- timesteps = timesteps[-num_inference_steps:]
680
+ t_start = len(self.scheduler.timesteps) - num_inference_steps
681
+ timesteps = self.scheduler.timesteps[t_start:]
682
+ if hasattr(self.scheduler, "set_begin_index"):
683
+ self.scheduler.set_begin_index(t_start)
656
684
  return timesteps, num_inference_steps
657
685
 
658
- return timesteps, num_inference_steps - t_start
659
-
660
686
  def prepare_latents(
661
687
  self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
662
688
  ):
@@ -665,6 +691,12 @@ class StableDiffusionXLImg2ImgPipeline(
665
691
  f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
666
692
  )
667
693
 
694
+ latents_mean = latents_std = None
695
+ if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
696
+ latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
697
+ if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
698
+ latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
699
+
668
700
  # Offload text encoder if `enable_model_cpu_offload` was enabled
669
701
  if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
670
702
  self.text_encoder_2.to("cpu")
@@ -690,6 +722,13 @@ class StableDiffusionXLImg2ImgPipeline(
690
722
  )
691
723
 
692
724
  elif isinstance(generator, list):
725
+ if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
726
+ image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
727
+ elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
728
+ raise ValueError(
729
+ f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
730
+ )
731
+
693
732
  init_latents = [
694
733
  retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
695
734
  for i in range(batch_size)
@@ -702,7 +741,12 @@ class StableDiffusionXLImg2ImgPipeline(
702
741
  self.vae.to(dtype)
703
742
 
704
743
  init_latents = init_latents.to(dtype)
705
- init_latents = self.vae.config.scaling_factor * init_latents
744
+ if latents_mean is not None and latents_std is not None:
745
+ latents_mean = latents_mean.to(device=device, dtype=dtype)
746
+ latents_std = latents_std.to(device=device, dtype=dtype)
747
+ init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
748
+ else:
749
+ init_latents = self.vae.config.scaling_factor * init_latents
706
750
 
707
751
  if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
708
752
  # expand init_latents for batch_size
@@ -754,6 +798,9 @@ class StableDiffusionXLImg2ImgPipeline(
754
798
  def prepare_ip_adapter_image_embeds(
755
799
  self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
756
800
  ):
801
+ image_embeds = []
802
+ if do_classifier_free_guidance:
803
+ negative_image_embeds = []
757
804
  if ip_adapter_image_embeds is None:
758
805
  if not isinstance(ip_adapter_image, list):
759
806
  ip_adapter_image = [ip_adapter_image]
@@ -763,7 +810,6 @@ class StableDiffusionXLImg2ImgPipeline(
763
810
  f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
764
811
  )
765
812
 
766
- image_embeds = []
767
813
  for single_ip_adapter_image, image_proj_layer in zip(
768
814
  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
769
815
  ):
@@ -771,36 +817,28 @@ class StableDiffusionXLImg2ImgPipeline(
771
817
  single_image_embeds, single_negative_image_embeds = self.encode_image(
772
818
  single_ip_adapter_image, device, 1, output_hidden_state
773
819
  )
774
- single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
775
- single_negative_image_embeds = torch.stack(
776
- [single_negative_image_embeds] * num_images_per_prompt, dim=0
777
- )
778
820
 
821
+ image_embeds.append(single_image_embeds[None, :])
779
822
  if do_classifier_free_guidance:
780
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
781
- single_image_embeds = single_image_embeds.to(device)
782
-
783
- image_embeds.append(single_image_embeds)
823
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
784
824
  else:
785
- repeat_dims = [1]
786
- image_embeds = []
787
825
  for single_image_embeds in ip_adapter_image_embeds:
788
826
  if do_classifier_free_guidance:
789
827
  single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
790
- single_image_embeds = single_image_embeds.repeat(
791
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
792
- )
793
- single_negative_image_embeds = single_negative_image_embeds.repeat(
794
- num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
795
- )
796
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
797
- else:
798
- single_image_embeds = single_image_embeds.repeat(
799
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
800
- )
828
+ negative_image_embeds.append(single_negative_image_embeds)
801
829
  image_embeds.append(single_image_embeds)
802
830
 
803
- return image_embeds
831
+ ip_adapter_image_embeds = []
832
+ for i, single_image_embeds in enumerate(image_embeds):
833
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
834
+ if do_classifier_free_guidance:
835
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
836
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
837
+
838
+ single_image_embeds = single_image_embeds.to(device=device)
839
+ ip_adapter_image_embeds.append(single_image_embeds)
840
+
841
+ return ip_adapter_image_embeds
804
842
 
805
843
  def _get_add_time_ids(
806
844
  self,
@@ -862,8 +900,6 @@ class StableDiffusionXLImg2ImgPipeline(
862
900
  (
863
901
  AttnProcessor2_0,
864
902
  XFormersAttnProcessor,
865
- LoRAXFormersAttnProcessor,
866
- LoRAAttnProcessor2_0,
867
903
  ),
868
904
  )
869
905
  # if xformers or torch_2_0 is used attention block does not need
@@ -874,20 +910,22 @@ class StableDiffusionXLImg2ImgPipeline(
874
910
  self.vae.decoder.mid_block.to(dtype)
875
911
 
876
912
  # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
877
- def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
913
+ def get_guidance_scale_embedding(
914
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
915
+ ) -> torch.Tensor:
878
916
  """
879
917
  See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
880
918
 
881
919
  Args:
882
- timesteps (`torch.Tensor`):
883
- generate embedding vectors at these timesteps
920
+ w (`torch.Tensor`):
921
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
884
922
  embedding_dim (`int`, *optional*, defaults to 512):
885
- dimension of the embeddings to generate
886
- dtype:
887
- data type of the generated embeddings
923
+ Dimension of the embeddings to generate.
924
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
925
+ Data type of the generated embeddings.
888
926
 
889
927
  Returns:
890
- `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
928
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
891
929
  """
892
930
  assert len(w.shape) == 1
893
931
  w = w * 1000.0
@@ -951,6 +989,7 @@ class StableDiffusionXLImg2ImgPipeline(
951
989
  strength: float = 0.3,
952
990
  num_inference_steps: int = 50,
953
991
  timesteps: List[int] = None,
992
+ sigmas: List[float] = None,
954
993
  denoising_start: Optional[float] = None,
955
994
  denoising_end: Optional[float] = None,
956
995
  guidance_scale: float = 5.0,
@@ -959,13 +998,13 @@ class StableDiffusionXLImg2ImgPipeline(
959
998
  num_images_per_prompt: Optional[int] = 1,
960
999
  eta: float = 0.0,
961
1000
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
962
- latents: Optional[torch.FloatTensor] = None,
963
- prompt_embeds: Optional[torch.FloatTensor] = None,
964
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
965
- pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
966
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
1001
+ latents: Optional[torch.Tensor] = None,
1002
+ prompt_embeds: Optional[torch.Tensor] = None,
1003
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
1004
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
1005
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
967
1006
  ip_adapter_image: Optional[PipelineImageInput] = None,
968
- ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
1007
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
969
1008
  output_type: Optional[str] = "pil",
970
1009
  return_dict: bool = True,
971
1010
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -979,7 +1018,9 @@ class StableDiffusionXLImg2ImgPipeline(
979
1018
  aesthetic_score: float = 6.0,
980
1019
  negative_aesthetic_score: float = 2.5,
981
1020
  clip_skip: Optional[int] = None,
982
- callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
1021
+ callback_on_step_end: Optional[
1022
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
1023
+ ] = None,
983
1024
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
984
1025
  **kwargs,
985
1026
  ):
@@ -993,7 +1034,7 @@ class StableDiffusionXLImg2ImgPipeline(
993
1034
  prompt_2 (`str` or `List[str]`, *optional*):
994
1035
  The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
995
1036
  used in both text-encoders
996
- image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
1037
+ image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
997
1038
  The image(s) to modify with the pipeline.
998
1039
  strength (`float`, *optional*, defaults to 0.3):
999
1040
  Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
@@ -1009,6 +1050,10 @@ class StableDiffusionXLImg2ImgPipeline(
1009
1050
  Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
1010
1051
  in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
1011
1052
  passed will be used. Must be in descending order.
1053
+ sigmas (`List[float]`, *optional*):
1054
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
1055
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
1056
+ will be used.
1012
1057
  denoising_start (`float`, *optional*):
1013
1058
  When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
1014
1059
  bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
@@ -1045,30 +1090,30 @@ class StableDiffusionXLImg2ImgPipeline(
1045
1090
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
1046
1091
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
1047
1092
  to make generation deterministic.
1048
- latents (`torch.FloatTensor`, *optional*):
1093
+ latents (`torch.Tensor`, *optional*):
1049
1094
  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
1050
1095
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
1051
1096
  tensor will be generated by sampling using the supplied random `generator`.
1052
- prompt_embeds (`torch.FloatTensor`, *optional*):
1097
+ prompt_embeds (`torch.Tensor`, *optional*):
1053
1098
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
1054
1099
  provided, text embeddings will be generated from `prompt` input argument.
1055
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
1100
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
1056
1101
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1057
1102
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
1058
1103
  argument.
1059
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
1104
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
1060
1105
  Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
1061
1106
  If not provided, pooled text embeddings will be generated from `prompt` input argument.
1062
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
1107
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
1063
1108
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1064
1109
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
1065
1110
  input argument.
1066
1111
  ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
1067
- ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
1068
- Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
1069
- Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
1070
- if `do_classifier_free_guidance` is set to `True`.
1071
- If not provided, embeddings are computed from the `ip_adapter_image` input argument.
1112
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
1113
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
1114
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
1115
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
1116
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
1072
1117
  output_type (`str`, *optional*, defaults to `"pil"`):
1073
1118
  The output format of the generated image. Choose between
1074
1119
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -1124,11 +1169,11 @@ class StableDiffusionXLImg2ImgPipeline(
1124
1169
  clip_skip (`int`, *optional*):
1125
1170
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
1126
1171
  the output of the pre-final layer will be used for computing the prompt embeddings.
1127
- callback_on_step_end (`Callable`, *optional*):
1128
- A function that calls at the end of each denoising steps during the inference. The function is called
1129
- with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
1130
- callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
1131
- `callback_on_step_end_tensor_inputs`.
1172
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
1173
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
1174
+ each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
1175
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
1176
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
1132
1177
  callback_on_step_end_tensor_inputs (`List`, *optional*):
1133
1178
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
1134
1179
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1158,6 +1203,9 @@ class StableDiffusionXLImg2ImgPipeline(
1158
1203
  "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
1159
1204
  )
1160
1205
 
1206
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
1207
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
1208
+
1161
1209
  # 1. Check inputs. Raise error if not correct
1162
1210
  self.check_inputs(
1163
1211
  prompt,
@@ -1224,7 +1272,9 @@ class StableDiffusionXLImg2ImgPipeline(
1224
1272
  def denoising_value_valid(dnv):
1225
1273
  return isinstance(dnv, float) and 0 < dnv < 1
1226
1274
 
1227
- timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
1275
+ timesteps, num_inference_steps = retrieve_timesteps(
1276
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
1277
+ )
1228
1278
  timesteps, num_inference_steps = self.get_timesteps(
1229
1279
  num_inference_steps,
1230
1280
  strength,
@@ -1234,17 +1284,19 @@ class StableDiffusionXLImg2ImgPipeline(
1234
1284
  latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
1235
1285
 
1236
1286
  add_noise = True if self.denoising_start is None else False
1287
+
1237
1288
  # 6. Prepare latent variables
1238
- latents = self.prepare_latents(
1239
- image,
1240
- latent_timestep,
1241
- batch_size,
1242
- num_images_per_prompt,
1243
- prompt_embeds.dtype,
1244
- device,
1245
- generator,
1246
- add_noise,
1247
- )
1289
+ if latents is None:
1290
+ latents = self.prepare_latents(
1291
+ image,
1292
+ latent_timestep,
1293
+ batch_size,
1294
+ num_images_per_prompt,
1295
+ prompt_embeds.dtype,
1296
+ device,
1297
+ generator,
1298
+ add_noise,
1299
+ )
1248
1300
  # 7. Prepare extra step kwargs.
1249
1301
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1250
1302
 
@@ -1368,7 +1420,12 @@ class StableDiffusionXLImg2ImgPipeline(
1368
1420
  noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
1369
1421
 
1370
1422
  # compute the previous noisy sample x_t -> x_t-1
1423
+ latents_dtype = latents.dtype
1371
1424
  latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1425
+ if latents.dtype != latents_dtype:
1426
+ if torch.backends.mps.is_available():
1427
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1428
+ latents = latents.to(latents_dtype)
1372
1429
 
1373
1430
  if callback_on_step_end is not None:
1374
1431
  callback_kwargs = {}
@@ -1378,13 +1435,8 @@ class StableDiffusionXLImg2ImgPipeline(
1378
1435
 
1379
1436
  latents = callback_outputs.pop("latents", latents)
1380
1437
  prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1381
- negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1382
1438
  add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
1383
- negative_pooled_prompt_embeds = callback_outputs.pop(
1384
- "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
1385
- )
1386
1439
  add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
1387
- add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
1388
1440
 
1389
1441
  # call the callback, if provided
1390
1442
  if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -1403,6 +1455,10 @@ class StableDiffusionXLImg2ImgPipeline(
1403
1455
  if needs_upcasting:
1404
1456
  self.upcast_vae()
1405
1457
  latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
1458
+ elif latents.dtype != self.vae.dtype:
1459
+ if torch.backends.mps.is_available():
1460
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1461
+ self.vae = self.vae.to(latents.dtype)
1406
1462
 
1407
1463
  # unscale/denormalize the latents
1408
1464
  # denormalize with the mean and std if available and not None