diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. diffusers/__init__.py +97 -4
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +13 -1
  4. diffusers/image_processor.py +282 -71
  5. diffusers/loaders/__init__.py +24 -3
  6. diffusers/loaders/ip_adapter.py +543 -16
  7. diffusers/loaders/lora_base.py +138 -125
  8. diffusers/loaders/lora_conversion_utils.py +647 -0
  9. diffusers/loaders/lora_pipeline.py +2216 -230
  10. diffusers/loaders/peft.py +380 -0
  11. diffusers/loaders/single_file_model.py +71 -4
  12. diffusers/loaders/single_file_utils.py +597 -10
  13. diffusers/loaders/textual_inversion.py +5 -3
  14. diffusers/loaders/transformer_flux.py +181 -0
  15. diffusers/loaders/transformer_sd3.py +89 -0
  16. diffusers/loaders/unet.py +56 -12
  17. diffusers/models/__init__.py +49 -12
  18. diffusers/models/activations.py +22 -9
  19. diffusers/models/adapter.py +53 -53
  20. diffusers/models/attention.py +98 -13
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2160 -346
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +73 -12
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +70 -0
  36. diffusers/models/controlnet_sd3.py +26 -376
  37. diffusers/models/controlnet_sparsectrl.py +46 -719
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +996 -92
  49. diffusers/models/embeddings_flax.py +23 -9
  50. diffusers/models/model_loading_utils.py +264 -14
  51. diffusers/models/modeling_flax_utils.py +1 -1
  52. diffusers/models/modeling_utils.py +334 -51
  53. diffusers/models/normalization.py +157 -13
  54. diffusers/models/transformers/__init__.py +6 -0
  55. diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
  56. diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
  57. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  58. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  59. diffusers/models/transformers/pixart_transformer_2d.py +10 -2
  60. diffusers/models/transformers/sana_transformer.py +488 -0
  61. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  62. diffusers/models/transformers/transformer_2d.py +1 -1
  63. diffusers/models/transformers/transformer_allegro.py +422 -0
  64. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  65. diffusers/models/transformers/transformer_flux.py +189 -51
  66. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  67. diffusers/models/transformers/transformer_ltx.py +469 -0
  68. diffusers/models/transformers/transformer_mochi.py +499 -0
  69. diffusers/models/transformers/transformer_sd3.py +112 -18
  70. diffusers/models/transformers/transformer_temporal.py +1 -1
  71. diffusers/models/unets/unet_1d_blocks.py +1 -1
  72. diffusers/models/unets/unet_2d.py +8 -1
  73. diffusers/models/unets/unet_2d_blocks.py +88 -21
  74. diffusers/models/unets/unet_2d_condition.py +9 -9
  75. diffusers/models/unets/unet_3d_blocks.py +9 -7
  76. diffusers/models/unets/unet_motion_model.py +46 -68
  77. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  78. diffusers/models/unets/unet_stable_cascade.py +2 -2
  79. diffusers/models/unets/uvit_2d.py +1 -1
  80. diffusers/models/upsampling.py +14 -6
  81. diffusers/pipelines/__init__.py +69 -6
  82. diffusers/pipelines/allegro/__init__.py +48 -0
  83. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  84. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  85. diffusers/pipelines/animatediff/__init__.py +2 -0
  86. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  87. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
  88. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  89. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
  90. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
  91. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  92. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  93. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
  94. diffusers/pipelines/auto_pipeline.py +88 -10
  95. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  96. diffusers/pipelines/cogvideo/__init__.py +2 -0
  97. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
  98. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  99. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
  100. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
  101. diffusers/pipelines/cogview3/__init__.py +47 -0
  102. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  103. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  104. diffusers/pipelines/controlnet/__init__.py +86 -80
  105. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  106. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
  107. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
  108. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
  109. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
  110. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
  111. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
  112. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  113. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  114. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  115. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
  116. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  117. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
  118. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  119. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  120. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  121. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  122. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  123. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
  124. diffusers/pipelines/flux/__init__.py +23 -1
  125. diffusers/pipelines/flux/modeling_flux.py +47 -0
  126. diffusers/pipelines/flux/pipeline_flux.py +256 -48
  127. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  128. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  129. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  130. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  131. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  132. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  133. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  134. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  135. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  136. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  137. diffusers/pipelines/flux/pipeline_output.py +16 -0
  138. diffusers/pipelines/free_noise_utils.py +365 -5
  139. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  140. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  141. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  142. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
  143. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  145. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  146. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  147. diffusers/pipelines/kolors/text_encoder.py +2 -2
  148. diffusers/pipelines/kolors/tokenizer.py +4 -0
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  155. diffusers/pipelines/ltx/__init__.py +50 -0
  156. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  157. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  158. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  159. diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
  160. diffusers/pipelines/mochi/__init__.py +48 -0
  161. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  162. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  163. diffusers/pipelines/pag/__init__.py +13 -0
  164. diffusers/pipelines/pag/pag_utils.py +8 -2
  165. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
  166. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  167. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
  168. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  169. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
  170. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  171. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
  172. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  173. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  174. diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
  175. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  176. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  177. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  178. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  179. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  180. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  181. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  182. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  183. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  184. diffusers/pipelines/pipeline_loading_utils.py +250 -31
  185. diffusers/pipelines/pipeline_utils.py +158 -186
  186. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
  187. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
  188. diffusers/pipelines/sana/__init__.py +47 -0
  189. diffusers/pipelines/sana/pipeline_output.py +21 -0
  190. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  191. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  192. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  193. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  194. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
  195. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  196. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  197. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  198. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
  199. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
  200. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
  201. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  202. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  203. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  204. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  205. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
  206. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
  208. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  209. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  210. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  211. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  212. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  213. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  214. diffusers/quantizers/__init__.py +16 -0
  215. diffusers/quantizers/auto.py +139 -0
  216. diffusers/quantizers/base.py +233 -0
  217. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  218. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  219. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  220. diffusers/quantizers/gguf/__init__.py +1 -0
  221. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  222. diffusers/quantizers/gguf/utils.py +456 -0
  223. diffusers/quantizers/quantization_config.py +669 -0
  224. diffusers/quantizers/torchao/__init__.py +15 -0
  225. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  226. diffusers/schedulers/scheduling_ddim.py +4 -1
  227. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  228. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  229. diffusers/schedulers/scheduling_ddpm.py +6 -7
  230. diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
  231. diffusers/schedulers/scheduling_deis_multistep.py +102 -6
  232. diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
  236. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  237. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  238. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  239. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  240. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  241. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  244. diffusers/schedulers/scheduling_lcm.py +2 -6
  245. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  246. diffusers/schedulers/scheduling_repaint.py +1 -1
  247. diffusers/schedulers/scheduling_sasolver.py +102 -6
  248. diffusers/schedulers/scheduling_tcd.py +2 -6
  249. diffusers/schedulers/scheduling_unclip.py +4 -1
  250. diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
  251. diffusers/training_utils.py +63 -19
  252. diffusers/utils/__init__.py +7 -1
  253. diffusers/utils/constants.py +1 -0
  254. diffusers/utils/dummy_pt_objects.py +240 -0
  255. diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
  256. diffusers/utils/dynamic_modules_utils.py +3 -3
  257. diffusers/utils/hub_utils.py +44 -40
  258. diffusers/utils/import_utils.py +98 -8
  259. diffusers/utils/loading_utils.py +28 -4
  260. diffusers/utils/peft_utils.py +6 -3
  261. diffusers/utils/testing_utils.py +115 -1
  262. diffusers/utils/torch_utils.py +3 -0
  263. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
  264. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
  265. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  266. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  267. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  268. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ from transformers import (
25
25
 
26
26
  from ...callbacks import MultiPipelineCallbacks, PipelineCallback
27
27
  from ...image_processor import PipelineImageInput, VaeImageProcessor
28
- from ...loaders import SD3LoraLoaderMixin
28
+ from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
29
29
  from ...models.autoencoders import AutoencoderKL
30
30
  from ...models.transformers import SD3Transformer2DModel
31
31
  from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -74,6 +74,20 @@ EXAMPLE_DOC_STRING = """
74
74
  """
75
75
 
76
76
 
77
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
78
+ def calculate_shift(
79
+ image_seq_len,
80
+ base_seq_len: int = 256,
81
+ max_seq_len: int = 4096,
82
+ base_shift: float = 0.5,
83
+ max_shift: float = 1.16,
84
+ ):
85
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
86
+ b = base_shift - m * base_seq_len
87
+ mu = image_seq_len * m + b
88
+ return mu
89
+
90
+
77
91
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
78
92
  def retrieve_latents(
79
93
  encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -97,7 +111,7 @@ def retrieve_timesteps(
97
111
  sigmas: Optional[List[float]] = None,
98
112
  **kwargs,
99
113
  ):
100
- """
114
+ r"""
101
115
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
102
116
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
103
117
 
@@ -148,7 +162,7 @@ def retrieve_timesteps(
148
162
  return timesteps, num_inference_steps
149
163
 
150
164
 
151
- class StableDiffusion3InpaintPipeline(DiffusionPipeline):
165
+ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
152
166
  r"""
153
167
  Args:
154
168
  transformer ([`SD3Transformer2DModel`]):
@@ -224,6 +238,9 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
224
238
  )
225
239
  self.tokenizer_max_length = self.tokenizer.model_max_length
226
240
  self.default_sample_size = self.transformer.config.sample_size
241
+ self.patch_size = (
242
+ self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
243
+ )
227
244
 
228
245
  # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
229
246
  def _get_t5_prompt_embeds(
@@ -538,6 +555,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
538
555
  prompt,
539
556
  prompt_2,
540
557
  prompt_3,
558
+ height,
559
+ width,
541
560
  strength,
542
561
  negative_prompt=None,
543
562
  negative_prompt_2=None,
@@ -549,6 +568,15 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
549
568
  callback_on_step_end_tensor_inputs=None,
550
569
  max_sequence_length=None,
551
570
  ):
571
+ if (
572
+ height % (self.vae_scale_factor * self.patch_size) != 0
573
+ or width % (self.vae_scale_factor * self.patch_size) != 0
574
+ ):
575
+ raise ValueError(
576
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
577
+ f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
578
+ )
579
+
552
580
  if strength < 0 or strength > 1:
553
581
  raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
554
582
 
@@ -806,7 +834,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
806
834
  padding_mask_crop: Optional[int] = None,
807
835
  strength: float = 0.6,
808
836
  num_inference_steps: int = 50,
809
- timesteps: List[int] = None,
837
+ sigmas: Optional[List[float]] = None,
810
838
  guidance_scale: float = 7.0,
811
839
  negative_prompt: Optional[Union[str, List[str]]] = None,
812
840
  negative_prompt_2: Optional[Union[str, List[str]]] = None,
@@ -824,6 +852,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
824
852
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
825
853
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
826
854
  max_sequence_length: int = 256,
855
+ mu: Optional[float] = None,
827
856
  ):
828
857
  r"""
829
858
  Function invoked when calling the pipeline for generation.
@@ -874,10 +903,10 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
874
903
  num_inference_steps (`int`, *optional*, defaults to 50):
875
904
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
876
905
  expense of slower inference.
877
- timesteps (`List[int]`, *optional*):
878
- Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
879
- in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
880
- passed will be used. Must be in descending order.
906
+ sigmas (`List[float]`, *optional*):
907
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
908
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
909
+ will be used.
881
910
  guidance_scale (`float`, *optional*, defaults to 7.0):
882
911
  Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
883
912
  `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -921,8 +950,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
921
950
  The output format of the generate image. Choose between
922
951
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
923
952
  return_dict (`bool`, *optional*, defaults to `True`):
924
- Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
925
- of a plain tuple.
953
+ Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
954
+ a plain tuple.
926
955
  callback_on_step_end (`Callable`, *optional*):
927
956
  A function that calls at the end of each denoising steps during the inference. The function is called
928
957
  with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -933,6 +962,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
933
962
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
934
963
  `._callback_tensor_inputs` attribute of your pipeline class.
935
964
  max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
965
+ mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.
936
966
 
937
967
  Examples:
938
968
 
@@ -953,6 +983,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
953
983
  prompt,
954
984
  prompt_2,
955
985
  prompt_3,
986
+ height,
987
+ width,
956
988
  strength,
957
989
  negative_prompt=negative_prompt,
958
990
  negative_prompt_2=negative_prompt_2,
@@ -1007,7 +1039,24 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
1007
1039
  pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
1008
1040
 
1009
1041
  # 3. Prepare timesteps
1010
- timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
1042
+ scheduler_kwargs = {}
1043
+ if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
1044
+ image_seq_len = (int(height) // self.vae_scale_factor // self.transformer.config.patch_size) * (
1045
+ int(width) // self.vae_scale_factor // self.transformer.config.patch_size
1046
+ )
1047
+ mu = calculate_shift(
1048
+ image_seq_len,
1049
+ self.scheduler.config.base_image_seq_len,
1050
+ self.scheduler.config.max_image_seq_len,
1051
+ self.scheduler.config.base_shift,
1052
+ self.scheduler.config.max_shift,
1053
+ )
1054
+ scheduler_kwargs["mu"] = mu
1055
+ elif mu is not None:
1056
+ scheduler_kwargs["mu"] = mu
1057
+ timesteps, num_inference_steps = retrieve_timesteps(
1058
+ self.scheduler, num_inference_steps, device, sigmas=sigmas, **scheduler_kwargs
1059
+ )
1011
1060
  timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
1012
1061
  # check that number of inference steps is not < 1 - as this doesn't make sense
1013
1062
  if num_inference_steps < 1:
@@ -446,13 +446,14 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
446
446
  extra_step_kwargs["generator"] = generator
447
447
  return extra_step_kwargs
448
448
 
449
- # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
450
449
  def check_inputs(
451
450
  self,
452
451
  prompt,
453
452
  height,
454
453
  width,
455
454
  callback_steps,
455
+ gligen_images,
456
+ gligen_phrases,
456
457
  negative_prompt=None,
457
458
  prompt_embeds=None,
458
459
  negative_prompt_embeds=None,
@@ -499,6 +500,13 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
499
500
  f" {negative_prompt_embeds.shape}."
500
501
  )
501
502
 
503
+ if gligen_images is not None and gligen_phrases is not None:
504
+ if len(gligen_images) != len(gligen_phrases):
505
+ raise ValueError(
506
+ "`gligen_images` and `gligen_phrases` must have the same length when both are provided, but"
507
+ f" got: `gligen_images` with length {len(gligen_images)} != `gligen_phrases` with length {len(gligen_phrases)}."
508
+ )
509
+
502
510
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
503
511
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
504
512
  shape = (
@@ -814,6 +822,8 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
814
822
  height,
815
823
  width,
816
824
  callback_steps,
825
+ gligen_images,
826
+ gligen_phrases,
817
827
  negative_prompt,
818
828
  prompt_embeds,
819
829
  negative_prompt_embeds,
@@ -602,9 +602,9 @@ class StableDiffusionKDiffusionPipeline(
602
602
  sigma_min: float = self.k_diffusion_model.sigmas[0].item()
603
603
  sigma_max: float = self.k_diffusion_model.sigmas[-1].item()
604
604
  sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
605
- sigmas = sigmas.to(device)
606
605
  else:
607
606
  sigmas = self.scheduler.sigmas
607
+ sigmas = sigmas.to(device)
608
608
  sigmas = sigmas.to(prompt_embeds.dtype)
609
609
 
610
610
  # 6. Prepare latent variables
@@ -61,9 +61,21 @@ EXAMPLE_DOC_STRING = """
61
61
 
62
62
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
63
63
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
64
- """
65
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
66
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
64
+ r"""
65
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
66
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
67
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
68
+
69
+ Args:
70
+ noise_cfg (`torch.Tensor`):
71
+ The predicted noise tensor for the guided diffusion process.
72
+ noise_pred_text (`torch.Tensor`):
73
+ The predicted noise tensor for the text-guided diffusion process.
74
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
75
+ A rescale factor applied to the noise predictions.
76
+
77
+ Returns:
78
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
67
79
  """
68
80
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
69
81
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -83,7 +95,7 @@ def retrieve_timesteps(
83
95
  sigmas: Optional[List[float]] = None,
84
96
  **kwargs,
85
97
  ):
86
- """
98
+ r"""
87
99
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
88
100
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
89
101
 
@@ -61,9 +61,21 @@ EXAMPLE_DOC_STRING = """
61
61
 
62
62
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
63
63
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
64
- """
65
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
66
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
64
+ r"""
65
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
66
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
67
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
68
+
69
+ Args:
70
+ noise_cfg (`torch.Tensor`):
71
+ The predicted noise tensor for the guided diffusion process.
72
+ noise_pred_text (`torch.Tensor`):
73
+ The predicted noise tensor for the text-guided diffusion process.
74
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
75
+ A rescale factor applied to the noise predictions.
76
+
77
+ Returns:
78
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
67
79
  """
68
80
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
69
81
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -83,7 +95,7 @@ def retrieve_timesteps(
83
95
  sigmas: Optional[List[float]] = None,
84
96
  **kwargs,
85
97
  ):
86
- """
98
+ r"""
87
99
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
88
100
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
89
101
 
@@ -87,9 +87,21 @@ EXAMPLE_DOC_STRING = """
87
87
 
88
88
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
89
89
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
90
- """
91
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
92
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
90
+ r"""
91
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
92
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
93
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
94
+
95
+ Args:
96
+ noise_cfg (`torch.Tensor`):
97
+ The predicted noise tensor for the guided diffusion process.
98
+ noise_pred_text (`torch.Tensor`):
99
+ The predicted noise tensor for the text-guided diffusion process.
100
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
101
+ A rescale factor applied to the noise predictions.
102
+
103
+ Returns:
104
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
93
105
  """
94
106
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
95
107
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -109,7 +121,7 @@ def retrieve_timesteps(
109
121
  sigmas: Optional[List[float]] = None,
110
122
  **kwargs,
111
123
  ):
112
- """
124
+ r"""
113
125
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
114
126
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
115
127
 
@@ -225,11 +237,8 @@ class StableDiffusionXLPipeline(
225
237
  _callback_tensor_inputs = [
226
238
  "latents",
227
239
  "prompt_embeds",
228
- "negative_prompt_embeds",
229
240
  "add_text_embeds",
230
241
  "add_time_ids",
231
- "negative_pooled_prompt_embeds",
232
- "negative_add_time_ids",
233
242
  ]
234
243
 
235
244
  def __init__(
@@ -1231,13 +1240,8 @@ class StableDiffusionXLPipeline(
1231
1240
 
1232
1241
  latents = callback_outputs.pop("latents", latents)
1233
1242
  prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1234
- negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1235
1243
  add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
1236
- negative_pooled_prompt_embeds = callback_outputs.pop(
1237
- "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
1238
- )
1239
1244
  add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
1240
- negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
1241
1245
 
1242
1246
  # call the callback, if provided
1243
1247
  if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -90,9 +90,21 @@ EXAMPLE_DOC_STRING = """
90
90
 
91
91
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
92
92
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
93
- """
94
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
95
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
93
+ r"""
94
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
95
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
96
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
97
+
98
+ Args:
99
+ noise_cfg (`torch.Tensor`):
100
+ The predicted noise tensor for the guided diffusion process.
101
+ noise_pred_text (`torch.Tensor`):
102
+ The predicted noise tensor for the text-guided diffusion process.
103
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
104
+ A rescale factor applied to the noise predictions.
105
+
106
+ Returns:
107
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
96
108
  """
97
109
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
98
110
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -126,7 +138,7 @@ def retrieve_timesteps(
126
138
  sigmas: Optional[List[float]] = None,
127
139
  **kwargs,
128
140
  ):
129
- """
141
+ r"""
130
142
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
131
143
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
132
144
 
@@ -245,11 +257,8 @@ class StableDiffusionXLImg2ImgPipeline(
245
257
  _callback_tensor_inputs = [
246
258
  "latents",
247
259
  "prompt_embeds",
248
- "negative_prompt_embeds",
249
260
  "add_text_embeds",
250
261
  "add_time_ids",
251
- "negative_pooled_prompt_embeds",
252
- "add_neg_time_ids",
253
262
  ]
254
263
 
255
264
  def __init__(
@@ -640,14 +649,16 @@ class StableDiffusionXLImg2ImgPipeline(
640
649
  if denoising_start is None:
641
650
  init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
642
651
  t_start = max(num_inference_steps - init_timestep, 0)
643
- else:
644
- t_start = 0
645
652
 
646
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
653
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
654
+ if hasattr(self.scheduler, "set_begin_index"):
655
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
656
+
657
+ return timesteps, num_inference_steps - t_start
647
658
 
648
- # Strength is irrelevant if we directly request a timestep to start at;
649
- # that is, strength is determined by the denoising_start instead.
650
- if denoising_start is not None:
659
+ else:
660
+ # Strength is irrelevant if we directly request a timestep to start at;
661
+ # that is, strength is determined by the denoising_start instead.
651
662
  discrete_timestep_cutoff = int(
652
663
  round(
653
664
  self.scheduler.config.num_train_timesteps
@@ -655,7 +666,7 @@ class StableDiffusionXLImg2ImgPipeline(
655
666
  )
656
667
  )
657
668
 
658
- num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
669
+ num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
659
670
  if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
660
671
  # if the scheduler is a 2nd order scheduler we might have to do +1
661
672
  # because `num_inference_steps` might be even given that every timestep
@@ -666,11 +677,12 @@ class StableDiffusionXLImg2ImgPipeline(
666
677
  num_inference_steps = num_inference_steps + 1
667
678
 
668
679
  # because t_n+1 >= t_n, we slice the timesteps starting from the end
669
- timesteps = timesteps[-num_inference_steps:]
680
+ t_start = len(self.scheduler.timesteps) - num_inference_steps
681
+ timesteps = self.scheduler.timesteps[t_start:]
682
+ if hasattr(self.scheduler, "set_begin_index"):
683
+ self.scheduler.set_begin_index(t_start)
670
684
  return timesteps, num_inference_steps
671
685
 
672
- return timesteps, num_inference_steps - t_start
673
-
674
686
  def prepare_latents(
675
687
  self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
676
688
  ):
@@ -1423,13 +1435,8 @@ class StableDiffusionXLImg2ImgPipeline(
1423
1435
 
1424
1436
  latents = callback_outputs.pop("latents", latents)
1425
1437
  prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1426
- negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1427
1438
  add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
1428
- negative_pooled_prompt_embeds = callback_outputs.pop(
1429
- "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
1430
- )
1431
1439
  add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
1432
- add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
1433
1440
 
1434
1441
  # call the callback, if provided
1435
1442
  if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -101,9 +101,21 @@ EXAMPLE_DOC_STRING = """
101
101
 
102
102
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
103
103
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
104
- """
105
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
106
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
104
+ r"""
105
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
106
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
107
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
108
+
109
+ Args:
110
+ noise_cfg (`torch.Tensor`):
111
+ The predicted noise tensor for the guided diffusion process.
112
+ noise_pred_text (`torch.Tensor`):
113
+ The predicted noise tensor for the text-guided diffusion process.
114
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
115
+ A rescale factor applied to the noise predictions.
116
+
117
+ Returns:
118
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
107
119
  """
108
120
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
109
121
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -153,7 +165,7 @@ def retrieve_timesteps(
153
165
  sigmas: Optional[List[float]] = None,
154
166
  **kwargs,
155
167
  ):
156
- """
168
+ r"""
157
169
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
158
170
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
159
171
 
@@ -273,11 +285,8 @@ class StableDiffusionXLInpaintPipeline(
273
285
  _callback_tensor_inputs = [
274
286
  "latents",
275
287
  "prompt_embeds",
276
- "negative_prompt_embeds",
277
288
  "add_text_embeds",
278
289
  "add_time_ids",
279
- "negative_pooled_prompt_embeds",
280
- "add_neg_time_ids",
281
290
  "mask",
282
291
  "masked_image_latents",
283
292
  ]
@@ -901,14 +910,16 @@ class StableDiffusionXLInpaintPipeline(
901
910
  if denoising_start is None:
902
911
  init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
903
912
  t_start = max(num_inference_steps - init_timestep, 0)
904
- else:
905
- t_start = 0
906
913
 
907
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
914
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
915
+ if hasattr(self.scheduler, "set_begin_index"):
916
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
917
+
918
+ return timesteps, num_inference_steps - t_start
908
919
 
909
- # Strength is irrelevant if we directly request a timestep to start at;
910
- # that is, strength is determined by the denoising_start instead.
911
- if denoising_start is not None:
920
+ else:
921
+ # Strength is irrelevant if we directly request a timestep to start at;
922
+ # that is, strength is determined by the denoising_start instead.
912
923
  discrete_timestep_cutoff = int(
913
924
  round(
914
925
  self.scheduler.config.num_train_timesteps
@@ -916,7 +927,7 @@ class StableDiffusionXLInpaintPipeline(
916
927
  )
917
928
  )
918
929
 
919
- num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
930
+ num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
920
931
  if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
921
932
  # if the scheduler is a 2nd order scheduler we might have to do +1
922
933
  # because `num_inference_steps` might be even given that every timestep
@@ -927,11 +938,12 @@ class StableDiffusionXLInpaintPipeline(
927
938
  num_inference_steps = num_inference_steps + 1
928
939
 
929
940
  # because t_n+1 >= t_n, we slice the timesteps starting from the end
930
- timesteps = timesteps[-num_inference_steps:]
941
+ t_start = len(self.scheduler.timesteps) - num_inference_steps
942
+ timesteps = self.scheduler.timesteps[t_start:]
943
+ if hasattr(self.scheduler, "set_begin_index"):
944
+ self.scheduler.set_begin_index(t_start)
931
945
  return timesteps, num_inference_steps
932
946
 
933
- return timesteps, num_inference_steps - t_start
934
-
935
947
  # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
936
948
  def _get_add_time_ids(
937
949
  self,
@@ -1656,13 +1668,8 @@ class StableDiffusionXLInpaintPipeline(
1656
1668
 
1657
1669
  latents = callback_outputs.pop("latents", latents)
1658
1670
  prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1659
- negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1660
1671
  add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
1661
- negative_pooled_prompt_embeds = callback_outputs.pop(
1662
- "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
1663
- )
1664
1672
  add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
1665
- add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
1666
1673
  mask = callback_outputs.pop("mask", mask)
1667
1674
  masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
1668
1675
 
@@ -71,7 +71,7 @@ def retrieve_timesteps(
71
71
  sigmas: Optional[List[float]] = None,
72
72
  **kwargs,
73
73
  ):
74
- """
74
+ r"""
75
75
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
76
76
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
77
77
 
@@ -127,7 +127,7 @@ def retrieve_timesteps(
127
127
  sigmas: Optional[List[float]] = None,
128
128
  **kwargs,
129
129
  ):
130
- """
130
+ r"""
131
131
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
132
132
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
133
133
 
@@ -119,9 +119,21 @@ def _preprocess_adapter_image(image, height, width):
119
119
 
120
120
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
121
121
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
122
- """
123
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
124
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
122
+ r"""
123
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
124
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
125
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
126
+
127
+ Args:
128
+ noise_cfg (`torch.Tensor`):
129
+ The predicted noise tensor for the guided diffusion process.
130
+ noise_pred_text (`torch.Tensor`):
131
+ The predicted noise tensor for the text-guided diffusion process.
132
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
133
+ A rescale factor applied to the noise predictions.
134
+
135
+ Returns:
136
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
125
137
  """
126
138
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
127
139
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -141,7 +153,7 @@ def retrieve_timesteps(
141
153
  sigmas: Optional[List[float]] = None,
142
154
  **kwargs,
143
155
  ):
144
- """
156
+ r"""
145
157
  Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
146
158
  custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
147
159
 
@@ -310,9 +310,21 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s
310
310
 
311
311
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
312
312
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
313
- """
314
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
315
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
313
+ r"""
314
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
315
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
316
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
317
+
318
+ Args:
319
+ noise_cfg (`torch.Tensor`):
320
+ The predicted noise tensor for the guided diffusion process.
321
+ noise_pred_text (`torch.Tensor`):
322
+ The predicted noise tensor for the text-guided diffusion process.
323
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
324
+ A rescale factor applied to the noise predictions.
325
+
326
+ Returns:
327
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
316
328
  """
317
329
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
318
330
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
@@ -104,8 +104,8 @@ class PatchEmbed(nn.Module):
104
104
 
105
105
  self.use_pos_embed = use_pos_embed
106
106
  if self.use_pos_embed:
107
- pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
108
- self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
107
+ pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5), output_type="pt")
108
+ self.register_buffer("pos_embed", pos_embed.float().unsqueeze(0), persistent=False)
109
109
 
110
110
  def forward(self, latent):
111
111
  latent = self.proj(latent)