diffusers 0.31.0__py3-none-any.whl → 0.32.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. diffusers/__init__.py +66 -5
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +1 -1
  4. diffusers/dependency_versions_table.py +1 -1
  5. diffusers/image_processor.py +25 -17
  6. diffusers/loaders/__init__.py +22 -3
  7. diffusers/loaders/ip_adapter.py +538 -15
  8. diffusers/loaders/lora_base.py +124 -118
  9. diffusers/loaders/lora_conversion_utils.py +318 -3
  10. diffusers/loaders/lora_pipeline.py +1688 -368
  11. diffusers/loaders/peft.py +379 -0
  12. diffusers/loaders/single_file_model.py +71 -4
  13. diffusers/loaders/single_file_utils.py +519 -9
  14. diffusers/loaders/textual_inversion.py +3 -3
  15. diffusers/loaders/transformer_flux.py +181 -0
  16. diffusers/loaders/transformer_sd3.py +89 -0
  17. diffusers/loaders/unet.py +17 -4
  18. diffusers/models/__init__.py +47 -14
  19. diffusers/models/activations.py +22 -9
  20. diffusers/models/attention.py +13 -4
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2059 -281
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +2 -1
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +29 -495
  36. diffusers/models/controlnet_sd3.py +25 -379
  37. diffusers/models/controlnet_sparsectrl.py +46 -718
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +838 -43
  49. diffusers/models/model_loading_utils.py +88 -6
  50. diffusers/models/modeling_flax_utils.py +1 -1
  51. diffusers/models/modeling_utils.py +72 -26
  52. diffusers/models/normalization.py +78 -13
  53. diffusers/models/transformers/__init__.py +5 -0
  54. diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
  55. diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
  56. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  57. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  58. diffusers/models/transformers/pixart_transformer_2d.py +1 -1
  59. diffusers/models/transformers/sana_transformer.py +488 -0
  60. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  61. diffusers/models/transformers/transformer_2d.py +1 -1
  62. diffusers/models/transformers/transformer_allegro.py +422 -0
  63. diffusers/models/transformers/transformer_cogview3plus.py +1 -1
  64. diffusers/models/transformers/transformer_flux.py +30 -9
  65. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  66. diffusers/models/transformers/transformer_ltx.py +469 -0
  67. diffusers/models/transformers/transformer_mochi.py +499 -0
  68. diffusers/models/transformers/transformer_sd3.py +105 -17
  69. diffusers/models/transformers/transformer_temporal.py +1 -1
  70. diffusers/models/unets/unet_1d_blocks.py +1 -1
  71. diffusers/models/unets/unet_2d.py +8 -1
  72. diffusers/models/unets/unet_2d_blocks.py +88 -21
  73. diffusers/models/unets/unet_2d_condition.py +1 -1
  74. diffusers/models/unets/unet_3d_blocks.py +9 -7
  75. diffusers/models/unets/unet_motion_model.py +5 -5
  76. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  77. diffusers/models/unets/unet_stable_cascade.py +2 -2
  78. diffusers/models/unets/uvit_2d.py +1 -1
  79. diffusers/models/upsampling.py +8 -0
  80. diffusers/pipelines/__init__.py +34 -0
  81. diffusers/pipelines/allegro/__init__.py +48 -0
  82. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  83. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  84. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
  85. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
  86. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
  87. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
  88. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  89. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
  90. diffusers/pipelines/auto_pipeline.py +53 -6
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
  93. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
  94. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
  95. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
  96. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
  97. diffusers/pipelines/controlnet/__init__.py +86 -80
  98. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  99. diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
  100. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
  101. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
  102. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
  103. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
  104. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
  105. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  106. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  107. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  108. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
  109. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
  110. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  111. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
  112. diffusers/pipelines/flux/__init__.py +13 -1
  113. diffusers/pipelines/flux/modeling_flux.py +47 -0
  114. diffusers/pipelines/flux/pipeline_flux.py +204 -29
  115. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  116. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  117. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  118. diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
  119. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
  120. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
  121. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  122. diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
  123. diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
  124. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  125. diffusers/pipelines/flux/pipeline_output.py +16 -0
  126. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  127. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  128. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  129. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
  130. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  131. diffusers/pipelines/kolors/text_encoder.py +2 -2
  132. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  133. diffusers/pipelines/ltx/__init__.py +50 -0
  134. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  135. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  136. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  137. diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
  138. diffusers/pipelines/mochi/__init__.py +48 -0
  139. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  140. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  141. diffusers/pipelines/pag/__init__.py +7 -0
  142. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
  143. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
  144. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
  145. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
  146. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
  147. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
  148. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  149. diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
  150. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  151. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
  152. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  153. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  154. diffusers/pipelines/pipeline_loading_utils.py +25 -4
  155. diffusers/pipelines/pipeline_utils.py +35 -6
  156. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
  157. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
  158. diffusers/pipelines/sana/__init__.py +47 -0
  159. diffusers/pipelines/sana/pipeline_output.py +21 -0
  160. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  161. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
  163. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
  164. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
  165. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
  166. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  167. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
  168. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
  170. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  171. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  172. diffusers/quantizers/auto.py +14 -1
  173. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
  174. diffusers/quantizers/gguf/__init__.py +1 -0
  175. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  176. diffusers/quantizers/gguf/utils.py +456 -0
  177. diffusers/quantizers/quantization_config.py +280 -2
  178. diffusers/quantizers/torchao/__init__.py +15 -0
  179. diffusers/quantizers/torchao/torchao_quantizer.py +292 -0
  180. diffusers/schedulers/scheduling_ddpm.py +2 -6
  181. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
  182. diffusers/schedulers/scheduling_deis_multistep.py +28 -9
  183. diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
  184. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
  185. diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
  186. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
  187. diffusers/schedulers/scheduling_euler_discrete.py +4 -4
  188. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  189. diffusers/schedulers/scheduling_heun_discrete.py +4 -4
  190. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
  191. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
  192. diffusers/schedulers/scheduling_lcm.py +2 -6
  193. diffusers/schedulers/scheduling_lms_discrete.py +4 -4
  194. diffusers/schedulers/scheduling_repaint.py +1 -1
  195. diffusers/schedulers/scheduling_sasolver.py +28 -9
  196. diffusers/schedulers/scheduling_tcd.py +2 -6
  197. diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
  198. diffusers/training_utils.py +16 -2
  199. diffusers/utils/__init__.py +5 -0
  200. diffusers/utils/constants.py +1 -0
  201. diffusers/utils/dummy_pt_objects.py +180 -0
  202. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  203. diffusers/utils/dynamic_modules_utils.py +3 -3
  204. diffusers/utils/hub_utils.py +31 -39
  205. diffusers/utils/import_utils.py +67 -0
  206. diffusers/utils/peft_utils.py +3 -0
  207. diffusers/utils/testing_utils.py +56 -1
  208. diffusers/utils/torch_utils.py +3 -0
  209. {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/METADATA +6 -6
  210. {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/RECORD +214 -162
  211. {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/WHEEL +1 -1
  212. {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/LICENSE +0 -0
  213. {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/entry_points.txt +0 -0
  214. {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/top_level.txt +0 -0

--- a/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
+++ b/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
@@ -26,6 +26,7 @@ from ...models import AutoencoderOobleck, StableAudioDiTModel
 from ...models.embeddings import get_1d_rotary_pos_embed
 from ...schedulers import EDMDPMSolverMultistepScheduler
 from ...utils import (
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
 )
@@ -34,6 +35,13 @@ from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_stable_audio import StableAudioProjectionModel
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 EXAMPLE_DOC_STRING = """
@@ -438,7 +446,7 @@ class StableAudioPipeline(DiffusionPipeline):
                 f"`initial_audio_waveforms` must be of shape `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)` but has `{initial_audio_waveforms.ndim}` dimensions"
             )
 
-        audio_vae_length = self.transformer.config.sample_size * self.vae.hop_length
+        audio_vae_length = int(self.transformer.config.sample_size) * self.vae.hop_length
         audio_shape = (batch_size // num_waveforms_per_prompt, audio_channels, audio_vae_length)
 
         # check num_channels
@@ -726,6 +734,9 @@ class StableAudioPipeline(DiffusionPipeline):
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # 9. Post-processing
         if not output_type == "latent":
            audio = self.vae.decode(latents).sample
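
The guarded torch_xla import above means the Stable Audio denoising loop now flushes the lazy XLA graph once per step. A minimal sketch of how that path would be exercised, assuming torch_xla is installed; the checkpoint id is a placeholder:

import torch
import torch_xla.core.xla_model as xm  # assumes torch_xla is installed

from diffusers import StableAudioPipeline

# Placeholder checkpoint id; substitute the Stable Audio checkpoint you actually use.
pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=torch.float32)
pipe = pipe.to(xm.xla_device())  # on an XLA device, xm.mark_step() now runs after every denoising step

audio = pipe(prompt="rain on a tin roof", num_inference_steps=100).audios[0]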

--- a/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -255,7 +255,12 @@ class StableDiffusionPipeline(
         is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
             version.parse(unet.config._diffusers_version).base_version
         ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        self._is_unet_config_sample_size_int = isinstance(unet.config.sample_size, int)
+        is_unet_sample_size_less_64 = (
+            hasattr(unet.config, "sample_size")
+            and self._is_unet_config_sample_size_int
+            and unet.config.sample_size < 64
+        )
         if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
             deprecation_message = (
                 "The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -902,8 +907,18 @@ class StableDiffusionPipeline(
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        if not height or not width:
+            height = (
+                self.unet.config.sample_size
+                if self._is_unet_config_sample_size_int
+                else self.unet.config.sample_size[0]
+            )
+            width = (
+                self.unet.config.sample_size
+                if self._is_unet_config_sample_size_int
+                else self.unet.config.sample_size[1]
+            )
+            height, width = height * self.vae_scale_factor, width * self.vae_scale_factor
         # to deal with lora scaling and other possible forward hooks
 
         # 1. Check inputs. Raise error if not correct
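
The rewritten default above exists because `unet.config.sample_size` may now be a (height, width) tuple rather than a single int; only the int case feeds the old deprecation check, and the tuple case is indexed per dimension. A small illustration of the fallback arithmetic with hypothetical config values:

# Hypothetical values, only to illustrate the branch above.
sample_size = (96, 64)      # non-int sample_size from a UNet config
vae_scale_factor = 8

height = sample_size[0] * vae_scale_factor  # 768
width = sample_size[1] * vae_scale_factor   # 512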

--- a/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
+++ b/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+# Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,14 +17,16 @@ from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from transformers import (
+    BaseImageProcessor,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    PreTrainedModel,
     T5EncoderModel,
     T5TokenizerFast,
 )
 
-from ...image_processor import VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import SD3Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -68,6 +70,20 @@ EXAMPLE_DOC_STRING = """
 """
 
 
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
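
`calculate_shift` is a straight line through the reference points (base_seq_len, base_shift) and (max_seq_len, max_shift). Worked numbers with the function's default arguments (the pipeline itself passes the scheduler's configured values, which may differ):

# m = (1.16 - 0.5) / (4096 - 256) = 0.000171875
# b = 0.5 - 0.000171875 * 256    = 0.456
#
# With vae_scale_factor 8 and patch_size 2, a 1024x1024 image gives
# image_seq_len = (1024 // 8 // 2) * (1024 // 8 // 2) = 64 * 64 = 4096.
calculate_shift(4096)  # 0.000171875 * 4096 + 0.456 = 1.16 (the max_shift)
calculate_shift(1024)  # 512x512 -> 32 * 32 = 1024  -> 0.632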

@@ -128,7 +144,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -160,10 +176,14 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_3 (`T5TokenizerFast`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        image_encoder (`PreTrainedModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`BaseImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """
 
-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
 
     def __init__(
@@ -177,6 +197,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_2: CLIPTokenizer,
         text_encoder_3: T5EncoderModel,
         tokenizer_3: T5TokenizerFast,
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
     ):
         super().__init__()
 
@@ -190,6 +212,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             tokenizer_3=tokenizer_3,
             transformer=transformer,
             scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
@@ -642,6 +666,10 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
     def guidance_scale(self):
         return self._guidance_scale
 
+    @property
+    def skip_guidance_layers(self):
+        return self._skip_guidance_layers
+
     @property
     def clip_skip(self):
         return self._clip_skip
@@ -665,6 +693,83 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
     def interrupt(self):
         return self._interrupt
 
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
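
The two helper methods above feed image embeddings into `joint_attention_kwargs` during the denoising loop. A minimal usage sketch, assuming the `load_ip_adapter`/`set_ip_adapter_scale` helpers from the new `SD3IPAdapterMixin`; the model and adapter ids and the image URL are placeholders:

import torch
from diffusers import StableDiffusion3Pipeline
from diffusers.utils import load_image

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.bfloat16
).to("cuda")

# Placeholder adapter repository; use the SD3 IP-Adapter checkpoint you actually have.
pipe.load_ip_adapter("InstantX/SD3.5-Large-IP-Adapter")
pipe.set_ip_adapter_scale(0.6)

reference = load_image("https://example.com/reference.png")  # placeholder URL
image = pipe(
    prompt="a photo of a corgi on a surfboard",
    ip_adapter_image=reference,  # routed through encode_image / prepare_ip_adapter_image_embeds above
    num_inference_steps=28,
    guidance_scale=5.0,
).images[0]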

@@ -675,7 +780,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
@@ -687,6 +792,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -694,6 +801,11 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 256,
+        skip_guidance_layers: List[int] = None,
+        skip_layer_guidance_scale: float = 2.8,
+        skip_layer_guidance_stop: float = 0.2,
+        skip_layer_guidance_start: float = 0.01,
+        mu: Optional[float] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -715,10 +827,10 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -758,12 +870,17 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
-                of a plain tuple.
+                Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
+                a plain tuple.
             joint_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
@@ -778,6 +895,23 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
             max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+            skip_guidance_layers (`List[int]`, *optional*):
+                A list of integers that specify layers to skip during guidance. If not provided, all layers will be
+                used for guidance. If provided, the guidance will only be applied to the layers specified in the list.
+                Recommended value by StabiltyAI for Stable Diffusion 3.5 Medium is [7, 8, 9].
+            skip_layer_guidance_scale (`int`, *optional*): The scale of the guidance for the layers specified in
+                `skip_guidance_layers`. The guidance will be applied to the layers specified in `skip_guidance_layers`
+                with a scale of `skip_layer_guidance_scale`. The guidance will be applied to the rest of the layers
+                with a scale of `1`.
+            skip_layer_guidance_stop (`int`, *optional*): The step at which the guidance for the layers specified in
+                `skip_guidance_layers` will stop. The guidance will be applied to the layers specified in
+                `skip_guidance_layers` until the fraction specified in `skip_layer_guidance_stop`. Recommended value by
+                StabiltyAI for Stable Diffusion 3.5 Medium is 0.2.
+            skip_layer_guidance_start (`int`, *optional*): The step at which the guidance for the layers specified in
+                `skip_guidance_layers` will start. The guidance will be applied to the layers specified in
+                `skip_guidance_layers` from the fraction specified in `skip_layer_guidance_start`. Recommended value by
+                StabiltyAI for Stable Diffusion 3.5 Medium is 0.01.
+            mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.
 
         Examples:
 
@@ -809,6 +943,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         )
 
         self._guidance_scale = guidance_scale
+        self._skip_layer_guidance_scale = skip_layer_guidance_scale
         self._clip_skip = clip_skip
         self._joint_attention_kwargs = joint_attention_kwargs
         self._interrupt = False
@@ -851,15 +986,13 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         )
 
         if self.do_classifier_free_guidance:
+            if skip_guidance_layers is not None:
+                original_prompt_embeds = prompt_embeds
+                original_pooled_prompt_embeds = pooled_prompt_embeds
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
             pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
 
-        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
-        # 5. Prepare latent variables
+        # 4. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
@@ -872,7 +1005,49 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             latents,
         )
 
-        # 6. Denoising loop
+        # 5. Prepare timesteps
+        scheduler_kwargs = {}
+        if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
+            _, _, height, width = latents.shape
+            image_seq_len = (height // self.transformer.config.patch_size) * (
+                width // self.transformer.config.patch_size
+            )
+            mu = calculate_shift(
+                image_seq_len,
+                self.scheduler.config.base_image_seq_len,
+                self.scheduler.config.max_image_seq_len,
+                self.scheduler.config.base_shift,
+                self.scheduler.config.max_shift,
+            )
+            scheduler_kwargs["mu"] = mu
+        elif mu is not None:
+            scheduler_kwargs["mu"] = mu
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            **scheduler_kwargs,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 7. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
@@ -896,6 +1071,27 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 if self.do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    should_skip_layers = (
+                        True
+                        if i > num_inference_steps * skip_layer_guidance_start
+                        and i < num_inference_steps * skip_layer_guidance_stop
+                        else False
+                    )
+                    if skip_guidance_layers is not None and should_skip_layers:
+                        timestep = t.expand(latents.shape[0])
+                        latent_model_input = latents
+                        noise_pred_skip_layers = self.transformer(
+                            hidden_states=latent_model_input,
+                            timestep=timestep,
+                            encoder_hidden_states=original_prompt_embeds,
+                            pooled_projections=original_pooled_prompt_embeds,
+                            joint_attention_kwargs=self.joint_attention_kwargs,
+                            return_dict=False,
+                            skip_layers=skip_guidance_layers,
+                        )[0]
+                        noise_pred = (
+                            noise_pred + (noise_pred_text - noise_pred_skip_layers) * self._skip_layer_guidance_scale
+                        )
 
                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
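
Taken together, the new `__call__` arguments expose skip-layer guidance: an extra transformer pass with `skip_layers` set, blended into the noise prediction between the start and stop fractions. A short sketch using the values the docstring recommends for Stable Diffusion 3.5 Medium; the model id is an assumption:

import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16
).to("cuda")

image = pipe(
    prompt="a watercolor painting of a lighthouse at dusk",
    num_inference_steps=28,
    guidance_scale=4.5,
    skip_guidance_layers=[7, 8, 9],   # layers dropped in the extra skip-layer pass
    skip_layer_guidance_scale=2.8,
    skip_layer_guidance_start=0.01,   # with 28 steps the extra term applies roughly at steps 1-5
    skip_layer_guidance_stop=0.2,
).images[0]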
@@ -75,6 +75,20 @@ EXAMPLE_DOC_STRING = """
75
75
  """
76
76
 
77
77
 
78
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
79
+ def calculate_shift(
80
+ image_seq_len,
81
+ base_seq_len: int = 256,
82
+ max_seq_len: int = 4096,
83
+ base_shift: float = 0.5,
84
+ max_shift: float = 1.16,
85
+ ):
86
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
87
+ b = base_shift - m * base_seq_len
88
+ mu = image_seq_len * m + b
89
+ return mu
90
+
91
+
78
92
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
79
93
  def retrieve_latents(
80
94
  encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -218,6 +232,9 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
218
232
  )
219
233
  self.tokenizer_max_length = self.tokenizer.model_max_length
220
234
  self.default_sample_size = self.transformer.config.sample_size
235
+ self.patch_size = (
236
+ self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
237
+ )
221
238
 
222
239
  # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
223
240
  def _get_t5_prompt_embeds(
@@ -531,6 +548,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
531
548
  prompt,
532
549
  prompt_2,
533
550
  prompt_3,
551
+ height,
552
+ width,
534
553
  strength,
535
554
  negative_prompt=None,
536
555
  negative_prompt_2=None,
@@ -542,6 +561,15 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
542
561
  callback_on_step_end_tensor_inputs=None,
543
562
  max_sequence_length=None,
544
563
  ):
564
+ if (
565
+ height % (self.vae_scale_factor * self.patch_size) != 0
566
+ or width % (self.vae_scale_factor * self.patch_size) != 0
567
+ ):
568
+ raise ValueError(
569
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
570
+ f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
571
+ )
572
+
545
573
  if strength < 0 or strength > 1:
546
574
  raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
547
575
 
@@ -710,10 +738,12 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
710
738
  prompt: Union[str, List[str]] = None,
711
739
  prompt_2: Optional[Union[str, List[str]]] = None,
712
740
  prompt_3: Optional[Union[str, List[str]]] = None,
741
+ height: Optional[int] = None,
742
+ width: Optional[int] = None,
713
743
  image: PipelineImageInput = None,
714
744
  strength: float = 0.6,
715
745
  num_inference_steps: int = 50,
716
- timesteps: List[int] = None,
746
+ sigmas: Optional[List[float]] = None,
717
747
  guidance_scale: float = 7.0,
718
748
  negative_prompt: Optional[Union[str, List[str]]] = None,
719
749
  negative_prompt_2: Optional[Union[str, List[str]]] = None,
@@ -732,6 +762,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
732
762
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
733
763
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
734
764
  max_sequence_length: int = 256,
765
+ mu: Optional[float] = None,
735
766
  ):
736
767
  r"""
737
768
  Function invoked when calling the pipeline for generation.
@@ -753,10 +784,10 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
753
784
  num_inference_steps (`int`, *optional*, defaults to 50):
754
785
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
755
786
  expense of slower inference.
756
- timesteps (`List[int]`, *optional*):
757
- Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
758
- in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
759
- passed will be used. Must be in descending order.
787
+ sigmas (`List[float]`, *optional*):
788
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
789
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
790
+ will be used.
760
791
  guidance_scale (`float`, *optional*, defaults to 7.0):
761
792
  Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
762
793
  `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -800,8 +831,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
800
831
  The output format of the generate image. Choose between
801
832
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
802
833
  return_dict (`bool`, *optional*, defaults to `True`):
803
- Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
804
- of a plain tuple.
834
+ Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
835
+ a plain tuple.
805
836
  joint_attention_kwargs (`dict`, *optional*):
806
837
  A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
807
838
  `self.processor` in
@@ -816,6 +847,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
816
847
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
817
848
  `._callback_tensor_inputs` attribute of your pipeline class.
818
849
  max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
850
+ mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.
819
851
 
820
852
  Examples:
821
853
 
@@ -824,12 +856,16 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
824
856
  [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
825
857
  `tuple`. When returning a tuple, the first element is a list with the generated images.
826
858
  """
859
+ height = height or self.default_sample_size * self.vae_scale_factor
860
+ width = width or self.default_sample_size * self.vae_scale_factor
827
861
 
828
862
  # 1. Check inputs. Raise error if not correct
829
863
  self.check_inputs(
830
864
  prompt,
831
865
  prompt_2,
832
866
  prompt_3,
867
+ height,
868
+ width,
833
869
  strength,
834
870
  negative_prompt=negative_prompt,
835
871
  negative_prompt_2=negative_prompt_2,
@@ -890,10 +926,27 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
890
926
  pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
891
927
 
892
928
  # 3. Preprocess image
893
- image = self.image_processor.preprocess(image)
929
+ image = self.image_processor.preprocess(image, height=height, width=width)
894
930
 
895
931
  # 4. Prepare timesteps
896
- timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
932
+ scheduler_kwargs = {}
933
+ if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
934
+ image_seq_len = (int(height) // self.vae_scale_factor // self.transformer.config.patch_size) * (
935
+ int(width) // self.vae_scale_factor // self.transformer.config.patch_size
936
+ )
937
+ mu = calculate_shift(
938
+ image_seq_len,
939
+ self.scheduler.config.base_image_seq_len,
940
+ self.scheduler.config.max_image_seq_len,
941
+ self.scheduler.config.base_shift,
942
+ self.scheduler.config.max_shift,
943
+ )
944
+ scheduler_kwargs["mu"] = mu
945
+ elif mu is not None:
946
+ scheduler_kwargs["mu"] = mu
947
+ timesteps, num_inference_steps = retrieve_timesteps(
948
+ self.scheduler, num_inference_steps, device, sigmas=sigmas, **scheduler_kwargs
949
+ )
897
950
  timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
898
951
  latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
899
952
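
The img2img changes surface explicit `height`/`width` (validated to be divisible by `vae_scale_factor * patch_size`, i.e. 16 for SD3) alongside the `sigmas` and `mu` hooks. A minimal sketch, with a placeholder model id and image URL:

import torch
from diffusers import StableDiffusion3Img2ImgPipeline
from diffusers.utils import load_image

pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16  # placeholder model id
).to("cuda")

init_image = load_image("https://example.com/sketch.png")  # placeholder URL

# 1024 and 768 are both divisible by 16, so the new check_inputs() accepts them;
# something like 1000x750 would raise the ValueError added above.
image = pipe(
    prompt="turn the sketch into an oil painting",
    image=init_image,
    height=1024,
    width=768,
    strength=0.6,
    num_inference_steps=50,
).images[0]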