diffusers 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. diffusers/__init__.py +94 -3
  2. diffusers/commands/env.py +1 -5
  3. diffusers/configuration_utils.py +4 -9
  4. diffusers/dependency_versions_table.py +2 -2
  5. diffusers/image_processor.py +1 -2
  6. diffusers/loaders/__init__.py +17 -2
  7. diffusers/loaders/ip_adapter.py +10 -7
  8. diffusers/loaders/lora_base.py +752 -0
  9. diffusers/loaders/lora_pipeline.py +2222 -0
  10. diffusers/loaders/peft.py +213 -5
  11. diffusers/loaders/single_file.py +1 -12
  12. diffusers/loaders/single_file_model.py +31 -10
  13. diffusers/loaders/single_file_utils.py +262 -2
  14. diffusers/loaders/textual_inversion.py +1 -6
  15. diffusers/loaders/unet.py +23 -208
  16. diffusers/models/__init__.py +20 -0
  17. diffusers/models/activations.py +22 -0
  18. diffusers/models/attention.py +386 -7
  19. diffusers/models/attention_processor.py +1795 -629
  20. diffusers/models/autoencoders/__init__.py +2 -0
  21. diffusers/models/autoencoders/autoencoder_kl.py +14 -3
  22. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1035 -0
  23. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  24. diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
  25. diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
  26. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  27. diffusers/models/autoencoders/vq_model.py +4 -4
  28. diffusers/models/controlnet.py +2 -3
  29. diffusers/models/controlnet_hunyuan.py +401 -0
  30. diffusers/models/controlnet_sd3.py +11 -11
  31. diffusers/models/controlnet_sparsectrl.py +789 -0
  32. diffusers/models/controlnet_xs.py +40 -10
  33. diffusers/models/downsampling.py +68 -0
  34. diffusers/models/embeddings.py +319 -36
  35. diffusers/models/model_loading_utils.py +1 -3
  36. diffusers/models/modeling_flax_utils.py +1 -6
  37. diffusers/models/modeling_utils.py +4 -16
  38. diffusers/models/normalization.py +203 -12
  39. diffusers/models/transformers/__init__.py +6 -0
  40. diffusers/models/transformers/auraflow_transformer_2d.py +527 -0
  41. diffusers/models/transformers/cogvideox_transformer_3d.py +345 -0
  42. diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
  43. diffusers/models/transformers/latte_transformer_3d.py +327 -0
  44. diffusers/models/transformers/lumina_nextdit2d.py +340 -0
  45. diffusers/models/transformers/pixart_transformer_2d.py +102 -1
  46. diffusers/models/transformers/prior_transformer.py +1 -1
  47. diffusers/models/transformers/stable_audio_transformer.py +458 -0
  48. diffusers/models/transformers/transformer_flux.py +455 -0
  49. diffusers/models/transformers/transformer_sd3.py +18 -4
  50. diffusers/models/unets/unet_1d_blocks.py +1 -1
  51. diffusers/models/unets/unet_2d_condition.py +8 -1
  52. diffusers/models/unets/unet_3d_blocks.py +51 -920
  53. diffusers/models/unets/unet_3d_condition.py +4 -1
  54. diffusers/models/unets/unet_i2vgen_xl.py +4 -1
  55. diffusers/models/unets/unet_kandinsky3.py +1 -1
  56. diffusers/models/unets/unet_motion_model.py +1330 -84
  57. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  58. diffusers/models/unets/unet_stable_cascade.py +1 -3
  59. diffusers/models/unets/uvit_2d.py +1 -1
  60. diffusers/models/upsampling.py +64 -0
  61. diffusers/models/vq_model.py +8 -4
  62. diffusers/optimization.py +1 -1
  63. diffusers/pipelines/__init__.py +100 -3
  64. diffusers/pipelines/animatediff/__init__.py +4 -0
  65. diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
  66. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
  70. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  71. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
  72. diffusers/pipelines/aura_flow/__init__.py +48 -0
  73. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
  74. diffusers/pipelines/auto_pipeline.py +97 -19
  75. diffusers/pipelines/cogvideo/__init__.py +48 -0
  76. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +687 -0
  77. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  78. diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
  79. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
  80. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
  81. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
  82. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
  83. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
  84. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  85. diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
  86. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
  87. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
  88. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
  90. diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
  91. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
  96. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
  97. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
  98. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
  99. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  100. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
  101. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
  103. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  104. diffusers/pipelines/flux/__init__.py +47 -0
  105. diffusers/pipelines/flux/pipeline_flux.py +749 -0
  106. diffusers/pipelines/flux/pipeline_output.py +21 -0
  107. diffusers/pipelines/free_init_utils.py +2 -0
  108. diffusers/pipelines/free_noise_utils.py +236 -0
  109. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
  110. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
  111. diffusers/pipelines/kolors/__init__.py +54 -0
  112. diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
  113. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
  114. diffusers/pipelines/kolors/pipeline_output.py +21 -0
  115. diffusers/pipelines/kolors/text_encoder.py +889 -0
  116. diffusers/pipelines/kolors/tokenizer.py +334 -0
  117. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
  118. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
  119. diffusers/pipelines/latte/__init__.py +48 -0
  120. diffusers/pipelines/latte/pipeline_latte.py +881 -0
  121. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
  122. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
  123. diffusers/pipelines/lumina/__init__.py +48 -0
  124. diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
  125. diffusers/pipelines/pag/__init__.py +67 -0
  126. diffusers/pipelines/pag/pag_utils.py +237 -0
  127. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
  128. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
  129. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
  130. diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
  131. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
  132. diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
  133. diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
  134. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
  135. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
  136. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
  137. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
  138. diffusers/pipelines/pia/pipeline_pia.py +30 -37
  139. diffusers/pipelines/pipeline_flax_utils.py +4 -9
  140. diffusers/pipelines/pipeline_loading_utils.py +0 -3
  141. diffusers/pipelines/pipeline_utils.py +2 -14
  142. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
  143. diffusers/pipelines/stable_audio/__init__.py +50 -0
  144. diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
  145. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
  146. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
  147. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  148. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
  149. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
  150. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
  151. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
  152. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
  153. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
  154. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
  155. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
  156. diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
  157. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
  158. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
  159. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
  160. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
  161. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
  162. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
  163. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
  164. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
  165. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
  166. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
  167. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
  168. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
  170. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
  171. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
  172. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
  173. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
  174. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
  175. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
  176. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
  177. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
  178. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
  179. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  180. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  181. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
  182. diffusers/schedulers/__init__.py +8 -0
  183. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
  184. diffusers/schedulers/scheduling_ddim.py +1 -1
  185. diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
  186. diffusers/schedulers/scheduling_ddpm.py +1 -1
  187. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
  188. diffusers/schedulers/scheduling_deis_multistep.py +2 -2
  189. diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
  190. diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
  191. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
  192. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
  193. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
  194. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
  195. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
  196. diffusers/schedulers/scheduling_ipndm.py +1 -1
  197. diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
  198. diffusers/schedulers/scheduling_utils.py +1 -3
  199. diffusers/schedulers/scheduling_utils_flax.py +1 -3
  200. diffusers/training_utils.py +99 -14
  201. diffusers/utils/__init__.py +2 -2
  202. diffusers/utils/dummy_pt_objects.py +210 -0
  203. diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
  204. diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
  205. diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
  206. diffusers/utils/dynamic_modules_utils.py +1 -11
  207. diffusers/utils/export_utils.py +1 -4
  208. diffusers/utils/hub_utils.py +45 -42
  209. diffusers/utils/import_utils.py +19 -16
  210. diffusers/utils/loading_utils.py +76 -3
  211. diffusers/utils/testing_utils.py +11 -8
  212. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/METADATA +73 -83
  213. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/RECORD +217 -164
  214. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/WHEEL +1 -1
  215. diffusers/loaders/autoencoder.py +0 -146
  216. diffusers/loaders/controlnet.py +0 -136
  217. diffusers/loaders/lora.py +0 -1728
  218. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/LICENSE +0 -0
  219. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/entry_points.txt +0 -0
  220. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/top_level.txt +0 -0
@@ -22,12 +22,10 @@ import torch
22
22
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
23
23
 
24
24
  from ...image_processor import PipelineImageInput, VaeImageProcessor
25
- from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
25
+ from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
26
26
  from ...models import AutoencoderKL, UNet2DConditionModel
27
27
  from ...models.attention_processor import (
28
28
  AttnProcessor2_0,
29
- LoRAAttnProcessor2_0,
30
- LoRAXFormersAttnProcessor,
31
29
  XFormersAttnProcessor,
32
30
  )
33
31
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -68,7 +66,11 @@ def preprocess(image):
68
66
 
69
67
 
70
68
  class StableDiffusionUpscalePipeline(
71
- DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
69
+ DiffusionPipeline,
70
+ StableDiffusionMixin,
71
+ TextualInversionLoaderMixin,
72
+ StableDiffusionLoraLoaderMixin,
73
+ FromSingleFileMixin,
72
74
  ):
73
75
  r"""
74
76
  Pipeline for text-guided image super-resolution using Stable Diffusion 2.
@@ -78,8 +80,8 @@ class StableDiffusionUpscalePipeline(
78
80
 
79
81
  The pipeline also inherits the following loading methods:
80
82
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
81
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
82
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
83
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
84
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
83
85
  - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
84
86
 
85
87
  Args:
@@ -245,7 +247,7 @@ class StableDiffusionUpscalePipeline(
245
247
  """
246
248
  # set lora scale so that monkey patched LoRA
247
249
  # function of text encoder can correctly access it
248
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
250
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
249
251
  self._lora_scale = lora_scale
250
252
 
251
253
  # dynamically adjust the LoRA scale
@@ -378,7 +380,7 @@ class StableDiffusionUpscalePipeline(
378
380
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
379
381
 
380
382
  if self.text_encoder is not None:
381
- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
383
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
382
384
  # Retrieve the original scale by scaling back the LoRA layers
383
385
  unscale_lora_layers(self.text_encoder, lora_scale)
384
386
 
@@ -520,8 +522,6 @@ class StableDiffusionUpscalePipeline(
520
522
  (
521
523
  AttnProcessor2_0,
522
524
  XFormersAttnProcessor,
523
- LoRAXFormersAttnProcessor,
524
- LoRAAttnProcessor2_0,
525
525
  ),
526
526
  )
527
527
  # if xformers or torch_2_0 is used attention block does not need
@@ -616,7 +616,7 @@ class StableDiffusionUpscalePipeline(
616
616
  >>> # load model and scheduler
617
617
  >>> model_id = "stabilityai/stable-diffusion-x4-upscaler"
618
618
  >>> pipeline = StableDiffusionUpscalePipeline.from_pretrained(
619
- ... model_id, revision="fp16", torch_dtype=torch.float16
619
+ ... model_id, variant="fp16", torch_dtype=torch.float16
620
620
  ... )
621
621
  >>> pipeline = pipeline.to("cuda")
622
622
 
@@ -20,7 +20,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokeniz
20
20
  from transformers.models.clip.modeling_clip import CLIPTextModelOutput
21
21
 
22
22
  from ...image_processor import VaeImageProcessor
23
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
23
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
24
24
  from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
25
25
  from ...models.embeddings import get_timestep_embedding
26
26
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -58,7 +58,9 @@ EXAMPLE_DOC_STRING = """
58
58
  """
59
59
 
60
60
 
61
- class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
61
+ class StableUnCLIPPipeline(
62
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
63
+ ):
62
64
  """
63
65
  Pipeline for text-to-image generation using stable unCLIP.
64
66
 
@@ -67,8 +69,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
67
69
 
68
70
  The pipeline also inherits the following loading methods:
69
71
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
70
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
71
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
72
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
73
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
72
74
 
73
75
  Args:
74
76
  prior_tokenizer ([`CLIPTokenizer`]):
@@ -326,7 +328,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
326
328
  """
327
329
  # set lora scale so that monkey patched LoRA
328
330
  # function of text encoder can correctly access it
329
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
331
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
330
332
  self._lora_scale = lora_scale
331
333
 
332
334
  # dynamically adjust the LoRA scale
@@ -459,7 +461,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
459
461
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
460
462
 
461
463
  if self.text_encoder is not None:
462
- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
464
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
463
465
  # Retrieve the original scale by scaling back the LoRA layers
464
466
  unscale_lora_layers(self.text_encoder, lora_scale)
465
467
 
@@ -20,7 +20,7 @@ import torch
20
20
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
21
21
 
22
22
  from ...image_processor import VaeImageProcessor
23
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
23
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
24
24
  from ...models import AutoencoderKL, UNet2DConditionModel
25
25
  from ...models.embeddings import get_timestep_embedding
26
26
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -70,7 +70,7 @@ EXAMPLE_DOC_STRING = """
70
70
 
71
71
 
72
72
  class StableUnCLIPImg2ImgPipeline(
73
- DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
73
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
74
74
  ):
75
75
  """
76
76
  Pipeline for text-guided image-to-image generation using stable unCLIP.
@@ -80,8 +80,8 @@ class StableUnCLIPImg2ImgPipeline(
80
80
 
81
81
  The pipeline also inherits the following loading methods:
82
82
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
83
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
84
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
83
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
84
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
85
85
 
86
86
  Args:
87
87
  feature_extractor ([`CLIPImageProcessor`]):
@@ -290,7 +290,7 @@ class StableUnCLIPImg2ImgPipeline(
290
290
  """
291
291
  # set lora scale so that monkey patched LoRA
292
292
  # function of text encoder can correctly access it
293
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
293
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
294
294
  self._lora_scale = lora_scale
295
295
 
296
296
  # dynamically adjust the LoRA scale
@@ -423,7 +423,7 @@ class StableUnCLIPImg2ImgPipeline(
423
423
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
424
424
 
425
425
  if self.text_encoder is not None:
426
- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
426
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
427
427
  # Retrieve the original scale by scaling back the LoRA layers
428
428
  unscale_lora_layers(self.text_encoder, lora_scale)
429
429
 
@@ -25,6 +25,7 @@ except OptionalDependencyNotAvailable:
25
25
  else:
26
26
  _import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
27
27
  _import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]
28
+ _import_structure["pipeline_stable_diffusion_3_inpaint"] = ["StableDiffusion3InpaintPipeline"]
28
29
 
29
30
  if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
30
31
  try:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
35
36
  else:
36
37
  from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
37
38
  from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline
39
+ from .pipeline_stable_diffusion_3_inpaint import StableDiffusion3InpaintPipeline
38
40
 
39
41
  else:
40
42
  import sys
@@ -29,9 +29,12 @@ from ...models.autoencoders import AutoencoderKL
29
29
  from ...models.transformers import SD3Transformer2DModel
30
30
  from ...schedulers import FlowMatchEulerDiscreteScheduler
31
31
  from ...utils import (
32
+ USE_PEFT_BACKEND,
32
33
  is_torch_xla_available,
33
34
  logging,
34
35
  replace_example_docstring,
36
+ scale_lora_layers,
37
+ unscale_lora_layers,
35
38
  )
36
39
  from ...utils.torch_utils import randn_tensor
37
40
  from ..pipeline_utils import DiffusionPipeline
@@ -329,6 +332,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
329
332
  negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
330
333
  clip_skip: Optional[int] = None,
331
334
  max_sequence_length: int = 256,
335
+ lora_scale: Optional[float] = None,
332
336
  ):
333
337
  r"""
334
338
 
@@ -374,9 +378,22 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
374
378
  clip_skip (`int`, *optional*):
375
379
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
376
380
  the output of the pre-final layer will be used for computing the prompt embeddings.
381
+ lora_scale (`float`, *optional*):
382
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
377
383
  """
378
384
  device = device or self._execution_device
379
385
 
386
+ # set lora scale so that monkey patched LoRA
387
+ # function of text encoder can correctly access it
388
+ if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
389
+ self._lora_scale = lora_scale
390
+
391
+ # dynamically adjust the LoRA scale
392
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
393
+ scale_lora_layers(self.text_encoder, lora_scale)
394
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
395
+ scale_lora_layers(self.text_encoder_2, lora_scale)
396
+
380
397
  prompt = [prompt] if isinstance(prompt, str) else prompt
381
398
  if prompt is not None:
382
399
  batch_size = len(prompt)
@@ -479,6 +496,16 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
479
496
  [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
480
497
  )
481
498
 
499
+ if self.text_encoder is not None:
500
+ if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
501
+ # Retrieve the original scale by scaling back the LoRA layers
502
+ unscale_lora_layers(self.text_encoder, lora_scale)
503
+
504
+ if self.text_encoder_2 is not None:
505
+ if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
506
+ # Retrieve the original scale by scaling back the LoRA layers
507
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
508
+
482
509
  return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
483
510
 
484
511
  def check_inputs(
@@ -683,7 +710,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
683
710
  Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
684
711
  in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
685
712
  passed will be used. Must be in descending order.
686
- guidance_scale (`float`, *optional*, defaults to 5.0):
713
+ guidance_scale (`float`, *optional*, defaults to 7.0):
687
714
  Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
688
715
  `guidance_scale` is defined as `w` of equation 2. of [Imagen
689
716
  Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -746,8 +773,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
746
773
  Examples:
747
774
 
748
775
  Returns:
749
- [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
750
- [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
776
+ [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
777
+ [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
751
778
  `tuple`. When returning a tuple, the first element is a list with the generated images.
752
779
  """
753
780
 
@@ -787,6 +814,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
787
814
 
788
815
  device = self._execution_device
789
816
 
817
+ lora_scale = (
818
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
819
+ )
790
820
  (
791
821
  prompt_embeds,
792
822
  negative_prompt_embeds,
@@ -808,6 +838,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
808
838
  clip_skip=self.clip_skip,
809
839
  num_images_per_prompt=num_images_per_prompt,
810
840
  max_sequence_length=max_sequence_length,
841
+ lora_scale=lora_scale,
811
842
  )
812
843
 
813
844
  if self.do_classifier_free_guidance:
@@ -25,13 +25,17 @@ from transformers import (
25
25
  )
26
26
 
27
27
  from ...image_processor import PipelineImageInput, VaeImageProcessor
28
+ from ...loaders import SD3LoraLoaderMixin
28
29
  from ...models.autoencoders import AutoencoderKL
29
30
  from ...models.transformers import SD3Transformer2DModel
30
31
  from ...schedulers import FlowMatchEulerDiscreteScheduler
31
32
  from ...utils import (
33
+ USE_PEFT_BACKEND,
32
34
  is_torch_xla_available,
33
35
  logging,
34
36
  replace_example_docstring,
37
+ scale_lora_layers,
38
+ unscale_lora_layers,
35
39
  )
36
40
  from ...utils.torch_utils import randn_tensor
37
41
  from ..pipeline_utils import DiffusionPipeline
@@ -62,7 +66,7 @@ EXAMPLE_DOC_STRING = """
62
66
  >>> pipe = pipe.to(device)
63
67
 
64
68
  >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
65
- >>> init_image = load_image(url).resize((512, 512))
69
+ >>> init_image = load_image(url).resize((1024, 1024))
66
70
 
67
71
  >>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
68
72
 
@@ -346,6 +350,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
346
350
  negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
347
351
  clip_skip: Optional[int] = None,
348
352
  max_sequence_length: int = 256,
353
+ lora_scale: Optional[float] = None,
349
354
  ):
350
355
  r"""
351
356
 
@@ -391,9 +396,22 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
391
396
  clip_skip (`int`, *optional*):
392
397
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
393
398
  the output of the pre-final layer will be used for computing the prompt embeddings.
399
+ lora_scale (`float`, *optional*):
400
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
394
401
  """
395
402
  device = device or self._execution_device
396
403
 
404
+ # set lora scale so that monkey patched LoRA
405
+ # function of text encoder can correctly access it
406
+ if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
407
+ self._lora_scale = lora_scale
408
+
409
+ # dynamically adjust the LoRA scale
410
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
411
+ scale_lora_layers(self.text_encoder, lora_scale)
412
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
413
+ scale_lora_layers(self.text_encoder_2, lora_scale)
414
+
397
415
  prompt = [prompt] if isinstance(prompt, str) else prompt
398
416
  if prompt is not None:
399
417
  batch_size = len(prompt)
@@ -496,6 +514,16 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
496
514
  [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
497
515
  )
498
516
 
517
+ if self.text_encoder is not None:
518
+ if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
519
+ # Retrieve the original scale by scaling back the LoRA layers
520
+ unscale_lora_layers(self.text_encoder, lora_scale)
521
+
522
+ if self.text_encoder_2 is not None:
523
+ if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
524
+ # Retrieve the original scale by scaling back the LoRA layers
525
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
526
+
499
527
  return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
500
528
 
501
529
  def check_inputs(
@@ -605,8 +633,6 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
605
633
  )
606
634
 
607
635
  image = image.to(device=device, dtype=dtype)
608
- if image.shape[1] == self.vae.config.latent_channels:
609
- init_latents = image
610
636
 
611
637
  batch_size = batch_size * num_images_per_prompt
612
638
  if image.shape[1] == self.vae.config.latent_channels:
@@ -726,7 +752,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
726
752
  Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
727
753
  in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
728
754
  passed will be used. Must be in descending order.
729
- guidance_scale (`float`, *optional*, defaults to 5.0):
755
+ guidance_scale (`float`, *optional*, defaults to 7.0):
730
756
  Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
731
757
  `guidance_scale` is defined as `w` of equation 2. of [Imagen
732
758
  Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -785,8 +811,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
785
811
  Examples:
786
812
 
787
813
  Returns:
788
- [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
789
- [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
814
+ [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
815
+ [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
790
816
  `tuple`. When returning a tuple, the first element is a list with the generated images.
791
817
  """
792
818
 
@@ -854,7 +880,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
854
880
  # 4. Prepare timesteps
855
881
  timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
856
882
  timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
857
- latent_timestep = timesteps[:1].repeat(batch_size * num_inference_steps)
883
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
858
884
 
859
885
  # 5. Prepare latent variables
860
886
  if latents is None: