diffusers 0.27.0__py3-none-any.whl → 0.32.2__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (445) hide show
  1. diffusers/__init__.py +233 -6
  2. diffusers/callbacks.py +209 -0
  3. diffusers/commands/env.py +102 -6
  4. diffusers/configuration_utils.py +45 -16
  5. diffusers/dependency_versions_table.py +4 -3
  6. diffusers/image_processor.py +434 -110
  7. diffusers/loaders/__init__.py +42 -9
  8. diffusers/loaders/ip_adapter.py +626 -36
  9. diffusers/loaders/lora_base.py +900 -0
  10. diffusers/loaders/lora_conversion_utils.py +991 -125
  11. diffusers/loaders/lora_pipeline.py +3812 -0
  12. diffusers/loaders/peft.py +571 -7
  13. diffusers/loaders/single_file.py +405 -173
  14. diffusers/loaders/single_file_model.py +385 -0
  15. diffusers/loaders/single_file_utils.py +1783 -713
  16. diffusers/loaders/textual_inversion.py +41 -23
  17. diffusers/loaders/transformer_flux.py +181 -0
  18. diffusers/loaders/transformer_sd3.py +89 -0
  19. diffusers/loaders/unet.py +464 -540
  20. diffusers/loaders/unet_loader_utils.py +163 -0
  21. diffusers/models/__init__.py +76 -7
  22. diffusers/models/activations.py +65 -10
  23. diffusers/models/adapter.py +53 -53
  24. diffusers/models/attention.py +605 -18
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +4304 -687
  27. diffusers/models/autoencoders/__init__.py +8 -0
  28. diffusers/models/autoencoders/autoencoder_asym_kl.py +15 -17
  29. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  30. diffusers/models/autoencoders/autoencoder_kl.py +110 -28
  31. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  32. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1482 -0
  33. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  34. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  35. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  36. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +19 -24
  37. diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
  38. diffusers/models/autoencoders/autoencoder_tiny.py +21 -18
  39. diffusers/models/autoencoders/consistency_decoder_vae.py +45 -20
  40. diffusers/models/autoencoders/vae.py +41 -29
  41. diffusers/models/autoencoders/vq_model.py +182 -0
  42. diffusers/models/controlnet.py +47 -800
  43. diffusers/models/controlnet_flux.py +70 -0
  44. diffusers/models/controlnet_sd3.py +68 -0
  45. diffusers/models/controlnet_sparsectrl.py +116 -0
  46. diffusers/models/controlnets/__init__.py +23 -0
  47. diffusers/models/controlnets/controlnet.py +872 -0
  48. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +9 -9
  49. diffusers/models/controlnets/controlnet_flux.py +536 -0
  50. diffusers/models/controlnets/controlnet_hunyuan.py +401 -0
  51. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  52. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  53. diffusers/models/controlnets/controlnet_union.py +832 -0
  54. diffusers/models/controlnets/controlnet_xs.py +1946 -0
  55. diffusers/models/controlnets/multicontrolnet.py +183 -0
  56. diffusers/models/downsampling.py +85 -18
  57. diffusers/models/embeddings.py +1856 -158
  58. diffusers/models/embeddings_flax.py +23 -9
  59. diffusers/models/model_loading_utils.py +480 -0
  60. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  61. diffusers/models/modeling_flax_utils.py +2 -7
  62. diffusers/models/modeling_outputs.py +14 -0
  63. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  64. diffusers/models/modeling_utils.py +611 -146
  65. diffusers/models/normalization.py +361 -20
  66. diffusers/models/resnet.py +18 -23
  67. diffusers/models/transformers/__init__.py +16 -0
  68. diffusers/models/transformers/auraflow_transformer_2d.py +544 -0
  69. diffusers/models/transformers/cogvideox_transformer_3d.py +542 -0
  70. diffusers/models/transformers/dit_transformer_2d.py +240 -0
  71. diffusers/models/transformers/dual_transformer_2d.py +9 -8
  72. diffusers/models/transformers/hunyuan_transformer_2d.py +578 -0
  73. diffusers/models/transformers/latte_transformer_3d.py +327 -0
  74. diffusers/models/transformers/lumina_nextdit2d.py +340 -0
  75. diffusers/models/transformers/pixart_transformer_2d.py +445 -0
  76. diffusers/models/transformers/prior_transformer.py +13 -13
  77. diffusers/models/transformers/sana_transformer.py +488 -0
  78. diffusers/models/transformers/stable_audio_transformer.py +458 -0
  79. diffusers/models/transformers/t5_film_transformer.py +17 -19
  80. diffusers/models/transformers/transformer_2d.py +297 -187
  81. diffusers/models/transformers/transformer_allegro.py +422 -0
  82. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  83. diffusers/models/transformers/transformer_flux.py +593 -0
  84. diffusers/models/transformers/transformer_hunyuan_video.py +791 -0
  85. diffusers/models/transformers/transformer_ltx.py +469 -0
  86. diffusers/models/transformers/transformer_mochi.py +499 -0
  87. diffusers/models/transformers/transformer_sd3.py +461 -0
  88. diffusers/models/transformers/transformer_temporal.py +21 -19
  89. diffusers/models/unets/unet_1d.py +8 -8
  90. diffusers/models/unets/unet_1d_blocks.py +31 -31
  91. diffusers/models/unets/unet_2d.py +17 -10
  92. diffusers/models/unets/unet_2d_blocks.py +225 -149
  93. diffusers/models/unets/unet_2d_condition.py +50 -53
  94. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  95. diffusers/models/unets/unet_3d_blocks.py +192 -1057
  96. diffusers/models/unets/unet_3d_condition.py +22 -27
  97. diffusers/models/unets/unet_i2vgen_xl.py +22 -18
  98. diffusers/models/unets/unet_kandinsky3.py +2 -2
  99. diffusers/models/unets/unet_motion_model.py +1413 -89
  100. diffusers/models/unets/unet_spatio_temporal_condition.py +40 -16
  101. diffusers/models/unets/unet_stable_cascade.py +19 -18
  102. diffusers/models/unets/uvit_2d.py +2 -2
  103. diffusers/models/upsampling.py +95 -26
  104. diffusers/models/vq_model.py +12 -164
  105. diffusers/optimization.py +1 -1
  106. diffusers/pipelines/__init__.py +202 -3
  107. diffusers/pipelines/allegro/__init__.py +48 -0
  108. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  109. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  110. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  111. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  112. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  113. diffusers/pipelines/animatediff/__init__.py +8 -0
  114. diffusers/pipelines/animatediff/pipeline_animatediff.py +122 -109
  115. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1106 -0
  116. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1288 -0
  117. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1010 -0
  118. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +236 -180
  119. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  120. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  121. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  122. diffusers/pipelines/audioldm2/modeling_audioldm2.py +58 -39
  123. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +121 -36
  124. diffusers/pipelines/aura_flow/__init__.py +48 -0
  125. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +584 -0
  126. diffusers/pipelines/auto_pipeline.py +196 -28
  127. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  128. diffusers/pipelines/blip_diffusion/modeling_blip2.py +6 -6
  129. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  130. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  131. diffusers/pipelines/cogvideo/__init__.py +54 -0
  132. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +772 -0
  133. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  134. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +885 -0
  135. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +851 -0
  136. diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
  137. diffusers/pipelines/cogview3/__init__.py +47 -0
  138. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  139. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  140. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +6 -6
  141. diffusers/pipelines/controlnet/__init__.py +86 -80
  142. diffusers/pipelines/controlnet/multicontrolnet.py +7 -182
  143. diffusers/pipelines/controlnet/pipeline_controlnet.py +134 -87
  144. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  145. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +93 -77
  146. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +88 -197
  147. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +136 -90
  148. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +176 -80
  149. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +125 -89
  150. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  151. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  152. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  153. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  154. diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
  155. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1060 -0
  156. diffusers/pipelines/controlnet_sd3/__init__.py +57 -0
  157. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1133 -0
  158. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  159. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  160. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +916 -0
  161. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1111 -0
  162. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  163. diffusers/pipelines/deepfloyd_if/pipeline_if.py +16 -30
  164. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +20 -35
  165. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +23 -41
  166. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +22 -38
  167. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +25 -41
  168. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +19 -34
  169. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  170. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  171. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  172. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +70 -30
  173. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +48 -25
  174. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  175. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  176. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +21 -20
  177. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +27 -29
  178. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +33 -27
  179. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +33 -23
  180. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +36 -30
  181. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +102 -69
  182. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  183. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  184. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  185. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  186. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  187. diffusers/pipelines/dit/pipeline_dit.py +7 -4
  188. diffusers/pipelines/flux/__init__.py +69 -0
  189. diffusers/pipelines/flux/modeling_flux.py +47 -0
  190. diffusers/pipelines/flux/pipeline_flux.py +957 -0
  191. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  192. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  193. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  194. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  195. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  196. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  197. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  198. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  199. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  200. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  201. diffusers/pipelines/flux/pipeline_output.py +37 -0
  202. diffusers/pipelines/free_init_utils.py +41 -38
  203. diffusers/pipelines/free_noise_utils.py +596 -0
  204. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  205. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  206. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  207. diffusers/pipelines/hunyuandit/__init__.py +48 -0
  208. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +916 -0
  209. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  210. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  211. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +32 -29
  212. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  213. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  214. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  215. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  216. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +34 -31
  217. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  218. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  219. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  220. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  221. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  222. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  223. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  224. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +22 -35
  225. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +26 -37
  226. diffusers/pipelines/kolors/__init__.py +54 -0
  227. diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
  228. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1250 -0
  229. diffusers/pipelines/kolors/pipeline_output.py +21 -0
  230. diffusers/pipelines/kolors/text_encoder.py +889 -0
  231. diffusers/pipelines/kolors/tokenizer.py +338 -0
  232. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +82 -62
  233. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +77 -60
  234. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +12 -12
  235. diffusers/pipelines/latte/__init__.py +48 -0
  236. diffusers/pipelines/latte/pipeline_latte.py +881 -0
  237. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +80 -74
  238. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +85 -76
  239. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  240. diffusers/pipelines/ltx/__init__.py +50 -0
  241. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  242. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  243. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  244. diffusers/pipelines/lumina/__init__.py +48 -0
  245. diffusers/pipelines/lumina/pipeline_lumina.py +890 -0
  246. diffusers/pipelines/marigold/__init__.py +50 -0
  247. diffusers/pipelines/marigold/marigold_image_processing.py +576 -0
  248. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  249. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  250. diffusers/pipelines/mochi/__init__.py +48 -0
  251. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  252. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  253. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  254. diffusers/pipelines/pag/__init__.py +80 -0
  255. diffusers/pipelines/pag/pag_utils.py +243 -0
  256. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1328 -0
  257. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  258. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1610 -0
  259. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  260. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +969 -0
  261. diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
  262. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +865 -0
  263. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  264. diffusers/pipelines/pag/pipeline_pag_sd.py +1062 -0
  265. diffusers/pipelines/pag/pipeline_pag_sd_3.py +994 -0
  266. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  267. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +866 -0
  268. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  269. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  270. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1345 -0
  271. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1544 -0
  272. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1776 -0
  273. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  274. diffusers/pipelines/pia/pipeline_pia.py +74 -164
  275. diffusers/pipelines/pipeline_flax_utils.py +5 -10
  276. diffusers/pipelines/pipeline_loading_utils.py +515 -53
  277. diffusers/pipelines/pipeline_utils.py +411 -222
  278. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  279. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +76 -93
  280. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +873 -0
  281. diffusers/pipelines/sana/__init__.py +47 -0
  282. diffusers/pipelines/sana/pipeline_output.py +21 -0
  283. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  284. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +27 -23
  285. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  286. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  287. diffusers/pipelines/shap_e/renderer.py +1 -1
  288. diffusers/pipelines/stable_audio/__init__.py +50 -0
  289. diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
  290. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +756 -0
  291. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +71 -25
  292. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  293. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +35 -34
  294. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  295. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +20 -11
  296. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  297. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  298. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  299. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +145 -79
  300. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +43 -28
  301. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  302. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +100 -68
  303. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +109 -201
  304. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +131 -32
  305. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +247 -87
  306. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +30 -29
  307. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +35 -27
  308. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +49 -42
  309. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  310. diffusers/pipelines/stable_diffusion_3/__init__.py +54 -0
  311. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  312. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +1140 -0
  313. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +1036 -0
  314. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1250 -0
  315. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +29 -20
  316. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +59 -58
  317. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +31 -25
  318. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +38 -22
  319. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -24
  320. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -23
  321. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +107 -67
  322. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +316 -69
  323. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  324. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  325. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +98 -30
  326. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +121 -83
  327. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +161 -105
  328. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +142 -218
  329. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -29
  330. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  331. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  332. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +69 -39
  333. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +105 -74
  334. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  335. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +29 -49
  336. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +32 -93
  337. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +37 -25
  338. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +54 -40
  339. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  340. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  341. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  342. diffusers/pipelines/unidiffuser/modeling_uvit.py +12 -12
  343. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +29 -28
  344. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  345. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  346. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +6 -8
  347. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  348. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  349. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +15 -14
  350. diffusers/{models/dual_transformer_2d.py → quantizers/__init__.py} +2 -6
  351. diffusers/quantizers/auto.py +139 -0
  352. diffusers/quantizers/base.py +233 -0
  353. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  354. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  355. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  356. diffusers/quantizers/gguf/__init__.py +1 -0
  357. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  358. diffusers/quantizers/gguf/utils.py +456 -0
  359. diffusers/quantizers/quantization_config.py +669 -0
  360. diffusers/quantizers/torchao/__init__.py +15 -0
  361. diffusers/quantizers/torchao/torchao_quantizer.py +292 -0
  362. diffusers/schedulers/__init__.py +12 -2
  363. diffusers/schedulers/deprecated/__init__.py +1 -1
  364. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  365. diffusers/schedulers/scheduling_amused.py +5 -5
  366. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  367. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  368. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
  369. diffusers/schedulers/scheduling_ddim.py +27 -26
  370. diffusers/schedulers/scheduling_ddim_cogvideox.py +452 -0
  371. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  372. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  373. diffusers/schedulers/scheduling_ddim_parallel.py +32 -31
  374. diffusers/schedulers/scheduling_ddpm.py +27 -30
  375. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  376. diffusers/schedulers/scheduling_ddpm_parallel.py +33 -36
  377. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  378. diffusers/schedulers/scheduling_deis_multistep.py +150 -50
  379. diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
  380. diffusers/schedulers/scheduling_dpmsolver_multistep.py +221 -84
  381. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  382. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +158 -52
  383. diffusers/schedulers/scheduling_dpmsolver_sde.py +153 -34
  384. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +275 -86
  385. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +81 -57
  386. diffusers/schedulers/scheduling_edm_euler.py +62 -39
  387. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +30 -29
  388. diffusers/schedulers/scheduling_euler_discrete.py +255 -74
  389. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +458 -0
  390. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +320 -0
  391. diffusers/schedulers/scheduling_heun_discrete.py +174 -46
  392. diffusers/schedulers/scheduling_ipndm.py +9 -9
  393. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +138 -29
  394. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +132 -26
  395. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  396. diffusers/schedulers/scheduling_lcm.py +23 -29
  397. diffusers/schedulers/scheduling_lms_discrete.py +105 -28
  398. diffusers/schedulers/scheduling_pndm.py +20 -20
  399. diffusers/schedulers/scheduling_repaint.py +21 -21
  400. diffusers/schedulers/scheduling_sasolver.py +157 -60
  401. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  402. diffusers/schedulers/scheduling_tcd.py +41 -36
  403. diffusers/schedulers/scheduling_unclip.py +19 -16
  404. diffusers/schedulers/scheduling_unipc_multistep.py +243 -47
  405. diffusers/schedulers/scheduling_utils.py +12 -5
  406. diffusers/schedulers/scheduling_utils_flax.py +1 -3
  407. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  408. diffusers/training_utils.py +214 -30
  409. diffusers/utils/__init__.py +17 -1
  410. diffusers/utils/constants.py +3 -0
  411. diffusers/utils/doc_utils.py +1 -0
  412. diffusers/utils/dummy_pt_objects.py +592 -7
  413. diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
  414. diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
  415. diffusers/utils/dummy_torch_and_transformers_objects.py +1001 -71
  416. diffusers/utils/dynamic_modules_utils.py +34 -29
  417. diffusers/utils/export_utils.py +50 -6
  418. diffusers/utils/hub_utils.py +131 -17
  419. diffusers/utils/import_utils.py +210 -8
  420. diffusers/utils/loading_utils.py +118 -5
  421. diffusers/utils/logging.py +4 -2
  422. diffusers/utils/peft_utils.py +37 -7
  423. diffusers/utils/state_dict_utils.py +13 -2
  424. diffusers/utils/testing_utils.py +193 -11
  425. diffusers/utils/torch_utils.py +4 -0
  426. diffusers/video_processor.py +113 -0
  427. {diffusers-0.27.0.dist-info → diffusers-0.32.2.dist-info}/METADATA +82 -91
  428. diffusers-0.32.2.dist-info/RECORD +550 -0
  429. {diffusers-0.27.0.dist-info → diffusers-0.32.2.dist-info}/WHEEL +1 -1
  430. diffusers/loaders/autoencoder.py +0 -146
  431. diffusers/loaders/controlnet.py +0 -136
  432. diffusers/loaders/lora.py +0 -1349
  433. diffusers/models/prior_transformer.py +0 -12
  434. diffusers/models/t5_film_transformer.py +0 -70
  435. diffusers/models/transformer_2d.py +0 -25
  436. diffusers/models/transformer_temporal.py +0 -34
  437. diffusers/models/unet_1d.py +0 -26
  438. diffusers/models/unet_1d_blocks.py +0 -203
  439. diffusers/models/unet_2d.py +0 -27
  440. diffusers/models/unet_2d_blocks.py +0 -375
  441. diffusers/models/unet_2d_condition.py +0 -25
  442. diffusers-0.27.0.dist-info/RECORD +0 -399
  443. {diffusers-0.27.0.dist-info → diffusers-0.32.2.dist-info}/LICENSE +0 -0
  444. {diffusers-0.27.0.dist-info → diffusers-0.32.2.dist-info}/entry_points.txt +0 -0
  445. {diffusers-0.27.0.dist-info → diffusers-0.32.2.dist-info}/top_level.txt +0 -0
@@ -13,13 +13,13 @@
13
13
 
14
14
  import copy
15
15
  import inspect
16
- from typing import Any, Callable, Dict, List, Optional, Union
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
17
 
18
18
  import torch
19
19
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
20
20
 
21
21
  from ...image_processor import PipelineImageInput, VaeImageProcessor
22
- from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
22
+ from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
23
23
  from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
24
24
  from ...models.lora import adjust_lora_scale_text_encoder
25
25
  from ...schedulers import DDIMScheduler
@@ -59,8 +59,99 @@ EXAMPLE_DOC_STRING = """
59
59
  """
60
60
 
61
61
 
62
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
63
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
64
+ r"""
65
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
66
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
67
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
68
+
69
+ Args:
70
+ noise_cfg (`torch.Tensor`):
71
+ The predicted noise tensor for the guided diffusion process.
72
+ noise_pred_text (`torch.Tensor`):
73
+ The predicted noise tensor for the text-guided diffusion process.
74
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
75
+ A rescale factor applied to the noise predictions.
76
+
77
+ Returns:
78
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
79
+ """
80
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
81
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
82
+ # rescale the results from guidance (fixes overexposure)
83
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
84
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
85
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
86
+ return noise_cfg
87
+
88
+
89
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
90
+ def retrieve_timesteps(
91
+ scheduler,
92
+ num_inference_steps: Optional[int] = None,
93
+ device: Optional[Union[str, torch.device]] = None,
94
+ timesteps: Optional[List[int]] = None,
95
+ sigmas: Optional[List[float]] = None,
96
+ **kwargs,
97
+ ):
98
+ r"""
99
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
100
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
101
+
102
+ Args:
103
+ scheduler (`SchedulerMixin`):
104
+ The scheduler to get timesteps from.
105
+ num_inference_steps (`int`):
106
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
107
+ must be `None`.
108
+ device (`str` or `torch.device`, *optional*):
109
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
110
+ timesteps (`List[int]`, *optional*):
111
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
112
+ `num_inference_steps` and `sigmas` must be `None`.
113
+ sigmas (`List[float]`, *optional*):
114
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
115
+ `num_inference_steps` and `timesteps` must be `None`.
116
+
117
+ Returns:
118
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
119
+ second element is the number of inference steps.
120
+ """
121
+ if timesteps is not None and sigmas is not None:
122
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
123
+ if timesteps is not None:
124
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
125
+ if not accepts_timesteps:
126
+ raise ValueError(
127
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
128
+ f" timestep schedules. Please check whether you are using the correct scheduler."
129
+ )
130
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
131
+ timesteps = scheduler.timesteps
132
+ num_inference_steps = len(timesteps)
133
+ elif sigmas is not None:
134
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
135
+ if not accept_sigmas:
136
+ raise ValueError(
137
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
138
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
139
+ )
140
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
141
+ timesteps = scheduler.timesteps
142
+ num_inference_steps = len(timesteps)
143
+ else:
144
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
145
+ timesteps = scheduler.timesteps
146
+ return timesteps, num_inference_steps
147
+
148
+
62
149
  class StableDiffusionPanoramaPipeline(
63
- DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
150
+ DiffusionPipeline,
151
+ StableDiffusionMixin,
152
+ TextualInversionLoaderMixin,
153
+ StableDiffusionLoraLoaderMixin,
154
+ IPAdapterMixin,
64
155
  ):
65
156
  r"""
66
157
  Pipeline for text-to-image generation using MultiDiffusion.
@@ -70,8 +161,8 @@ class StableDiffusionPanoramaPipeline(
70
161
 
71
162
  The pipeline also inherits the following loading methods:
72
163
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
73
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
74
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
164
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
165
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
75
166
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
76
167
 
77
168
  Args:
@@ -97,6 +188,7 @@ class StableDiffusionPanoramaPipeline(
97
188
  model_cpu_offload_seq = "text_encoder->unet->vae"
98
189
  _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
99
190
  _exclude_from_cpu_offload = ["safety_checker"]
191
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
100
192
 
101
193
  def __init__(
102
194
  self,
@@ -150,8 +242,8 @@ class StableDiffusionPanoramaPipeline(
150
242
  num_images_per_prompt,
151
243
  do_classifier_free_guidance,
152
244
  negative_prompt=None,
153
- prompt_embeds: Optional[torch.FloatTensor] = None,
154
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
245
+ prompt_embeds: Optional[torch.Tensor] = None,
246
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
155
247
  lora_scale: Optional[float] = None,
156
248
  **kwargs,
157
249
  ):
@@ -183,8 +275,8 @@ class StableDiffusionPanoramaPipeline(
183
275
  num_images_per_prompt,
184
276
  do_classifier_free_guidance,
185
277
  negative_prompt=None,
186
- prompt_embeds: Optional[torch.FloatTensor] = None,
187
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
278
+ prompt_embeds: Optional[torch.Tensor] = None,
279
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
188
280
  lora_scale: Optional[float] = None,
189
281
  clip_skip: Optional[int] = None,
190
282
  ):
@@ -204,10 +296,10 @@ class StableDiffusionPanoramaPipeline(
204
296
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
205
297
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
206
298
  less than `1`).
207
- prompt_embeds (`torch.FloatTensor`, *optional*):
299
+ prompt_embeds (`torch.Tensor`, *optional*):
208
300
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
209
301
  provided, text embeddings will be generated from `prompt` input argument.
210
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
302
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
211
303
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
212
304
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
213
305
  argument.
@@ -219,7 +311,7 @@ class StableDiffusionPanoramaPipeline(
219
311
  """
220
312
  # set lora scale so that monkey patched LoRA
221
313
  # function of text encoder can correctly access it
222
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
314
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
223
315
  self._lora_scale = lora_scale
224
316
 
225
317
  # dynamically adjust the LoRA scale
@@ -351,9 +443,10 @@ class StableDiffusionPanoramaPipeline(
351
443
  negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
352
444
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
353
445
 
354
- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
355
- # Retrieve the original scale by scaling back the LoRA layers
356
- unscale_lora_layers(self.text_encoder, lora_scale)
446
+ if self.text_encoder is not None:
447
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
448
+ # Retrieve the original scale by scaling back the LoRA layers
449
+ unscale_lora_layers(self.text_encoder, lora_scale)
357
450
 
358
451
  return prompt_embeds, negative_prompt_embeds
359
452
 
@@ -386,6 +479,9 @@ class StableDiffusionPanoramaPipeline(
386
479
  def prepare_ip_adapter_image_embeds(
387
480
  self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
388
481
  ):
482
+ image_embeds = []
483
+ if do_classifier_free_guidance:
484
+ negative_image_embeds = []
389
485
  if ip_adapter_image_embeds is None:
390
486
  if not isinstance(ip_adapter_image, list):
391
487
  ip_adapter_image = [ip_adapter_image]
@@ -395,7 +491,6 @@ class StableDiffusionPanoramaPipeline(
395
491
  f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
396
492
  )
397
493
 
398
- image_embeds = []
399
494
  for single_ip_adapter_image, image_proj_layer in zip(
400
495
  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
401
496
  ):
@@ -403,36 +498,28 @@ class StableDiffusionPanoramaPipeline(
403
498
  single_image_embeds, single_negative_image_embeds = self.encode_image(
404
499
  single_ip_adapter_image, device, 1, output_hidden_state
405
500
  )
406
- single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
407
- single_negative_image_embeds = torch.stack(
408
- [single_negative_image_embeds] * num_images_per_prompt, dim=0
409
- )
410
501
 
502
+ image_embeds.append(single_image_embeds[None, :])
411
503
  if do_classifier_free_guidance:
412
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
413
- single_image_embeds = single_image_embeds.to(device)
414
-
415
- image_embeds.append(single_image_embeds)
504
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
416
505
  else:
417
- repeat_dims = [1]
418
- image_embeds = []
419
506
  for single_image_embeds in ip_adapter_image_embeds:
420
507
  if do_classifier_free_guidance:
421
508
  single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
422
- single_image_embeds = single_image_embeds.repeat(
423
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
424
- )
425
- single_negative_image_embeds = single_negative_image_embeds.repeat(
426
- num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
427
- )
428
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
429
- else:
430
- single_image_embeds = single_image_embeds.repeat(
431
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
432
- )
509
+ negative_image_embeds.append(single_negative_image_embeds)
433
510
  image_embeds.append(single_image_embeds)
434
511
 
435
- return image_embeds
512
+ ip_adapter_image_embeds = []
513
+ for i, single_image_embeds in enumerate(image_embeds):
514
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
515
+ if do_classifier_free_guidance:
516
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
517
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
518
+
519
+ single_image_embeds = single_image_embeds.to(device=device)
520
+ ip_adapter_image_embeds.append(single_image_embeds)
521
+
522
+ return ip_adapter_image_embeds
436
523
 
437
524
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
438
525
  def run_safety_checker(self, image, device, dtype):
@@ -461,10 +548,23 @@ class StableDiffusionPanoramaPipeline(
461
548
  image = image.cpu().permute(0, 2, 3, 1).float().numpy()
462
549
  return image
463
550
 
464
- def decode_latents_with_padding(self, latents, padding=8):
465
- # Add padding to latents for circular inference
466
- # padding is the number of latents to add on each side
467
- # it would slightly increase the memory usage, but remove the boundary artifacts
551
+ def decode_latents_with_padding(self, latents: torch.Tensor, padding: int = 8) -> torch.Tensor:
552
+ """
553
+ Decode the given latents with padding for circular inference.
554
+
555
+ Args:
556
+ latents (torch.Tensor): The input latents to decode.
557
+ padding (int, optional): The number of latents to add on each side for padding. Defaults to 8.
558
+
559
+ Returns:
560
+ torch.Tensor: The decoded image with padding removed.
561
+
562
+ Notes:
563
+ - The padding is added to remove boundary artifacts and improve the output quality.
564
+ - This would slightly increase the memory usage.
565
+ - The padding pixels are then removed from the decoded image.
566
+
567
+ """
468
568
  latents = 1 / self.vae.config.scaling_factor * latents
469
569
  latents_left = latents[..., :padding]
470
570
  latents_right = latents[..., -padding:]
@@ -564,7 +664,12 @@ class StableDiffusionPanoramaPipeline(
564
664
 
565
665
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
566
666
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
567
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
667
+ shape = (
668
+ batch_size,
669
+ num_channels_latents,
670
+ int(height) // self.vae_scale_factor,
671
+ int(width) // self.vae_scale_factor,
672
+ )
568
673
  if isinstance(generator, list) and len(generator) != batch_size:
569
674
  raise ValueError(
570
675
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -580,9 +685,62 @@ class StableDiffusionPanoramaPipeline(
580
685
  latents = latents * self.scheduler.init_noise_sigma
581
686
  return latents
582
687
 
583
- def get_views(self, panorama_height, panorama_width, window_size=64, stride=8, circular_padding=False):
584
- # Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113)
585
- # if panorama's height/width < window_size, num_blocks of height/width should return 1
688
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
689
+ def get_guidance_scale_embedding(
690
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
691
+ ) -> torch.Tensor:
692
+ """
693
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
694
+
695
+ Args:
696
+ w (`torch.Tensor`):
697
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
698
+ embedding_dim (`int`, *optional*, defaults to 512):
699
+ Dimension of the embeddings to generate.
700
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
701
+ Data type of the generated embeddings.
702
+
703
+ Returns:
704
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
705
+ """
706
+ assert len(w.shape) == 1
707
+ w = w * 1000.0
708
+
709
+ half_dim = embedding_dim // 2
710
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
711
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
712
+ emb = w.to(dtype)[:, None] * emb[None, :]
713
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
714
+ if embedding_dim % 2 == 1: # zero pad
715
+ emb = torch.nn.functional.pad(emb, (0, 1))
716
+ assert emb.shape == (w.shape[0], embedding_dim)
717
+ return emb
718
+
719
+ def get_views(
720
+ self,
721
+ panorama_height: int,
722
+ panorama_width: int,
723
+ window_size: int = 64,
724
+ stride: int = 8,
725
+ circular_padding: bool = False,
726
+ ) -> List[Tuple[int, int, int, int]]:
727
+ """
728
+ Generates a list of views based on the given parameters. Here, we define the mappings F_i (see Eq. 7 in the
729
+ MultiDiffusion paper https://arxiv.org/abs/2302.08113). If panorama's height/width < window_size, num_blocks of
730
+ height/width should return 1.
731
+
732
+ Args:
733
+ panorama_height (int): The height of the panorama.
734
+ panorama_width (int): The width of the panorama.
735
+ window_size (int, optional): The size of the window. Defaults to 64.
736
+ stride (int, optional): The stride value. Defaults to 8.
737
+ circular_padding (bool, optional): Whether to apply circular padding. Defaults to False.
738
+
739
+ Returns:
740
+ List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains four integers
741
+ representing the start and end coordinates of the window in the panorama.
742
+
743
+ """
586
744
  panorama_height /= 8
587
745
  panorama_width /= 8
588
746
  num_blocks_height = (panorama_height - window_size) // stride + 1 if panorama_height > window_size else 1
@@ -600,6 +758,34 @@ class StableDiffusionPanoramaPipeline(
600
758
  views.append((h_start, h_end, w_start, w_end))
601
759
  return views
602
760
 
761
+ @property
762
+ def guidance_scale(self):
763
+ return self._guidance_scale
764
+
765
+ @property
766
+ def guidance_rescale(self):
767
+ return self._guidance_rescale
768
+
769
+ @property
770
+ def cross_attention_kwargs(self):
771
+ return self._cross_attention_kwargs
772
+
773
+ @property
774
+ def clip_skip(self):
775
+ return self._clip_skip
776
+
777
+ @property
778
+ def do_classifier_free_guidance(self):
779
+ return False
780
+
781
+ @property
782
+ def num_timesteps(self):
783
+ return self._num_timesteps
784
+
785
+ @property
786
+ def interrupt(self):
787
+ return self._interrupt
788
+
603
789
  @torch.no_grad()
604
790
  @replace_example_docstring(EXAMPLE_DOC_STRING)
605
791
  def __call__(
@@ -608,24 +794,27 @@ class StableDiffusionPanoramaPipeline(
608
794
  height: Optional[int] = 512,
609
795
  width: Optional[int] = 2048,
610
796
  num_inference_steps: int = 50,
797
+ timesteps: List[int] = None,
611
798
  guidance_scale: float = 7.5,
612
799
  view_batch_size: int = 1,
613
800
  negative_prompt: Optional[Union[str, List[str]]] = None,
614
801
  num_images_per_prompt: Optional[int] = 1,
615
802
  eta: float = 0.0,
616
803
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
617
- latents: Optional[torch.FloatTensor] = None,
618
- prompt_embeds: Optional[torch.FloatTensor] = None,
619
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
804
+ latents: Optional[torch.Tensor] = None,
805
+ prompt_embeds: Optional[torch.Tensor] = None,
806
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
620
807
  ip_adapter_image: Optional[PipelineImageInput] = None,
621
- ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
808
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
622
809
  output_type: Optional[str] = "pil",
623
810
  return_dict: bool = True,
624
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
625
- callback_steps: Optional[int] = 1,
626
811
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
812
+ guidance_rescale: float = 0.0,
627
813
  circular_padding: bool = False,
628
814
  clip_skip: Optional[int] = None,
815
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
816
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
817
+ **kwargs: Any,
629
818
  ):
630
819
  r"""
631
820
  The call function to the pipeline for generation.
@@ -641,6 +830,9 @@ class StableDiffusionPanoramaPipeline(
641
830
  num_inference_steps (`int`, *optional*, defaults to 50):
642
831
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
643
832
  expense of slower inference.
833
+ timesteps (`List[int]`, *optional*):
834
+ The timesteps at which to generate the images. If not specified, then the default timestep spacing
835
+ strategy of the scheduler is used.
644
836
  guidance_scale (`float`, *optional*, defaults to 7.5):
645
837
  A higher guidance scale value encourages the model to generate images closely linked to the text
646
838
  `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
@@ -658,38 +850,34 @@ class StableDiffusionPanoramaPipeline(
658
850
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
659
851
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
660
852
  generation deterministic.
661
- latents (`torch.FloatTensor`, *optional*):
853
+ latents (`torch.Tensor`, *optional*):
662
854
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
663
855
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
664
856
  tensor is generated by sampling using the supplied random `generator`.
665
- prompt_embeds (`torch.FloatTensor`, *optional*):
857
+ prompt_embeds (`torch.Tensor`, *optional*):
666
858
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
667
859
  provided, text embeddings are generated from the `prompt` input argument.
668
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
860
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
669
861
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
670
862
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
671
863
  ip_adapter_image: (`PipelineImageInput`, *optional*):
672
864
  Optional image input to work with IP Adapters.
673
- ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
674
- Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
675
- Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
676
- if `do_classifier_free_guidance` is set to `True`.
677
- If not provided, embeddings are computed from the `ip_adapter_image` input argument.
865
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
866
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
867
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
868
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
869
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
678
870
  output_type (`str`, *optional*, defaults to `"pil"`):
679
871
  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
680
872
  return_dict (`bool`, *optional*, defaults to `True`):
681
873
  Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
682
874
  plain tuple.
683
- callback (`Callable`, *optional*):
684
- A function that calls every `callback_steps` steps during inference. The function is called with the
685
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
686
- callback_steps (`int`, *optional*, defaults to 1):
687
- The frequency at which the `callback` function is called. If not specified, the callback is called at
688
- every step.
689
875
  cross_attention_kwargs (`dict`, *optional*):
690
876
  A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
691
877
  `self.processor` in
692
878
  [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
879
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
880
+ A rescaling factor for the guidance embeddings. A value of 0.0 means no rescaling is applied.
693
881
  circular_padding (`bool`, *optional*, defaults to `False`):
694
882
  If set to `True`, circular padding is applied to ensure there are no stitching artifacts. Circular
695
883
  padding allows the model to seamlessly generate a transition from the rightmost part of the image to
@@ -697,6 +885,15 @@ class StableDiffusionPanoramaPipeline(
697
885
  clip_skip (`int`, *optional*):
698
886
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
699
887
  the output of the pre-final layer will be used for computing the prompt embeddings.
888
+ callback_on_step_end (`Callable`, *optional*):
889
+ A function that calls at the end of each denoising steps during the inference. The function is called
890
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
891
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
892
+ `callback_on_step_end_tensor_inputs`.
893
+ callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
894
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
895
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
896
+ `._callback_tensor_inputs` attribute of your pipeline class.
700
897
  Examples:
701
898
 
702
899
  Returns:
@@ -706,6 +903,22 @@ class StableDiffusionPanoramaPipeline(
706
903
  second element is a list of `bool`s indicating whether the corresponding generated image contains
707
904
  "not-safe-for-work" (nsfw) content.
708
905
  """
906
+ callback = kwargs.pop("callback", None)
907
+ callback_steps = kwargs.pop("callback_steps", None)
908
+
909
+ if callback is not None:
910
+ deprecate(
911
+ "callback",
912
+ "1.0.0",
913
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
914
+ )
915
+ if callback_steps is not None:
916
+ deprecate(
917
+ "callback_steps",
918
+ "1.0.0",
919
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
920
+ )
921
+
709
922
  # 0. Default height and width to unet
710
923
  height = height or self.unet.config.sample_size * self.vae_scale_factor
711
924
  width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -721,8 +934,15 @@ class StableDiffusionPanoramaPipeline(
721
934
  negative_prompt_embeds,
722
935
  ip_adapter_image,
723
936
  ip_adapter_image_embeds,
937
+ callback_on_step_end_tensor_inputs,
724
938
  )
725
939
 
940
+ self._guidance_scale = guidance_scale
941
+ self._guidance_rescale = guidance_rescale
942
+ self._clip_skip = clip_skip
943
+ self._cross_attention_kwargs = cross_attention_kwargs
944
+ self._interrupt = False
945
+
726
946
  # 2. Define call parameters
727
947
  if prompt is not None and isinstance(prompt, str):
728
948
  batch_size = 1
@@ -768,8 +988,7 @@ class StableDiffusionPanoramaPipeline(
768
988
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
769
989
 
770
990
  # 4. Prepare timesteps
771
- self.scheduler.set_timesteps(num_inference_steps, device=device)
772
- timesteps = self.scheduler.timesteps
991
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
773
992
 
774
993
  # 5. Prepare latent variables
775
994
  num_channels_latents = self.unet.config.in_channels
@@ -802,12 +1021,23 @@ class StableDiffusionPanoramaPipeline(
802
1021
  else None
803
1022
  )
804
1023
 
1024
+ # 7.2 Optionally get Guidance Scale Embedding
1025
+ timestep_cond = None
1026
+ if self.unet.config.time_cond_proj_dim is not None:
1027
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
1028
+ timestep_cond = self.get_guidance_scale_embedding(
1029
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1030
+ ).to(device=device, dtype=latents.dtype)
1031
+
805
1032
  # 8. Denoising loop
806
1033
  # Each denoising step also includes refinement of the latents with respect to the
807
1034
  # views.
808
1035
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1036
+ self._num_timesteps = len(timesteps)
809
1037
  with self.progress_bar(total=num_inference_steps) as progress_bar:
810
1038
  for i, t in enumerate(timesteps):
1039
+ if self.interrupt:
1040
+ continue
811
1041
  count.zero_()
812
1042
  value.zero_()
813
1043
 
@@ -863,6 +1093,7 @@ class StableDiffusionPanoramaPipeline(
863
1093
  latent_model_input,
864
1094
  t,
865
1095
  encoder_hidden_states=prompt_embeds_input,
1096
+ timestep_cond=timestep_cond,
866
1097
  cross_attention_kwargs=cross_attention_kwargs,
867
1098
  added_cond_kwargs=added_cond_kwargs,
868
1099
  ).sample
@@ -872,6 +1103,12 @@ class StableDiffusionPanoramaPipeline(
872
1103
  noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
873
1104
  noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
874
1105
 
1106
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1107
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1108
+ noise_pred = rescale_noise_cfg(
1109
+ noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
1110
+ )
1111
+
875
1112
  # compute the previous noisy sample x_t -> x_t-1
876
1113
  latents_denoised_batch = self.scheduler.step(
877
1114
  noise_pred, t, latents_for_view, **extra_step_kwargs
@@ -901,6 +1138,16 @@ class StableDiffusionPanoramaPipeline(
901
1138
  # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113
902
1139
  latents = torch.where(count > 0, value / count, value)
903
1140
 
1141
+ if callback_on_step_end is not None:
1142
+ callback_kwargs = {}
1143
+ for k in callback_on_step_end_tensor_inputs:
1144
+ callback_kwargs[k] = locals()[k]
1145
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1146
+
1147
+ latents = callback_outputs.pop("latents", latents)
1148
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1149
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1150
+
904
1151
  # call the callback, if provided
905
1152
  if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
906
1153
  progress_bar.update()
@@ -908,7 +1155,7 @@ class StableDiffusionPanoramaPipeline(
908
1155
  step_idx = i // getattr(self.scheduler, "order", 1)
909
1156
  callback(step_idx, t, latents)
910
1157
 
911
- if not output_type == "latent":
1158
+ if output_type != "latent":
912
1159
  if circular_padding:
913
1160
  image = self.decode_latents_with_padding(latents)
914
1161
  else: