diffusers 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. diffusers/__init__.py +94 -3
  2. diffusers/commands/env.py +1 -5
  3. diffusers/configuration_utils.py +4 -9
  4. diffusers/dependency_versions_table.py +2 -2
  5. diffusers/image_processor.py +1 -2
  6. diffusers/loaders/__init__.py +17 -2
  7. diffusers/loaders/ip_adapter.py +10 -7
  8. diffusers/loaders/lora_base.py +752 -0
  9. diffusers/loaders/lora_pipeline.py +2222 -0
  10. diffusers/loaders/peft.py +213 -5
  11. diffusers/loaders/single_file.py +1 -12
  12. diffusers/loaders/single_file_model.py +31 -10
  13. diffusers/loaders/single_file_utils.py +262 -2
  14. diffusers/loaders/textual_inversion.py +1 -6
  15. diffusers/loaders/unet.py +23 -208
  16. diffusers/models/__init__.py +20 -0
  17. diffusers/models/activations.py +22 -0
  18. diffusers/models/attention.py +386 -7
  19. diffusers/models/attention_processor.py +1795 -629
  20. diffusers/models/autoencoders/__init__.py +2 -0
  21. diffusers/models/autoencoders/autoencoder_kl.py +14 -3
  22. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1035 -0
  23. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  24. diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
  25. diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
  26. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  27. diffusers/models/autoencoders/vq_model.py +4 -4
  28. diffusers/models/controlnet.py +2 -3
  29. diffusers/models/controlnet_hunyuan.py +401 -0
  30. diffusers/models/controlnet_sd3.py +11 -11
  31. diffusers/models/controlnet_sparsectrl.py +789 -0
  32. diffusers/models/controlnet_xs.py +40 -10
  33. diffusers/models/downsampling.py +68 -0
  34. diffusers/models/embeddings.py +319 -36
  35. diffusers/models/model_loading_utils.py +1 -3
  36. diffusers/models/modeling_flax_utils.py +1 -6
  37. diffusers/models/modeling_utils.py +4 -16
  38. diffusers/models/normalization.py +203 -12
  39. diffusers/models/transformers/__init__.py +6 -0
  40. diffusers/models/transformers/auraflow_transformer_2d.py +527 -0
  41. diffusers/models/transformers/cogvideox_transformer_3d.py +345 -0
  42. diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
  43. diffusers/models/transformers/latte_transformer_3d.py +327 -0
  44. diffusers/models/transformers/lumina_nextdit2d.py +340 -0
  45. diffusers/models/transformers/pixart_transformer_2d.py +102 -1
  46. diffusers/models/transformers/prior_transformer.py +1 -1
  47. diffusers/models/transformers/stable_audio_transformer.py +458 -0
  48. diffusers/models/transformers/transformer_flux.py +455 -0
  49. diffusers/models/transformers/transformer_sd3.py +18 -4
  50. diffusers/models/unets/unet_1d_blocks.py +1 -1
  51. diffusers/models/unets/unet_2d_condition.py +8 -1
  52. diffusers/models/unets/unet_3d_blocks.py +51 -920
  53. diffusers/models/unets/unet_3d_condition.py +4 -1
  54. diffusers/models/unets/unet_i2vgen_xl.py +4 -1
  55. diffusers/models/unets/unet_kandinsky3.py +1 -1
  56. diffusers/models/unets/unet_motion_model.py +1330 -84
  57. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  58. diffusers/models/unets/unet_stable_cascade.py +1 -3
  59. diffusers/models/unets/uvit_2d.py +1 -1
  60. diffusers/models/upsampling.py +64 -0
  61. diffusers/models/vq_model.py +8 -4
  62. diffusers/optimization.py +1 -1
  63. diffusers/pipelines/__init__.py +100 -3
  64. diffusers/pipelines/animatediff/__init__.py +4 -0
  65. diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
  66. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
  70. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  71. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
  72. diffusers/pipelines/aura_flow/__init__.py +48 -0
  73. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
  74. diffusers/pipelines/auto_pipeline.py +97 -19
  75. diffusers/pipelines/cogvideo/__init__.py +48 -0
  76. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +687 -0
  77. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  78. diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
  79. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
  80. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
  81. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
  82. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
  83. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
  84. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  85. diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
  86. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
  87. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
  88. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
  90. diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
  91. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
  96. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
  97. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
  98. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
  99. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  100. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
  101. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
  103. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  104. diffusers/pipelines/flux/__init__.py +47 -0
  105. diffusers/pipelines/flux/pipeline_flux.py +749 -0
  106. diffusers/pipelines/flux/pipeline_output.py +21 -0
  107. diffusers/pipelines/free_init_utils.py +2 -0
  108. diffusers/pipelines/free_noise_utils.py +236 -0
  109. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
  110. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
  111. diffusers/pipelines/kolors/__init__.py +54 -0
  112. diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
  113. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
  114. diffusers/pipelines/kolors/pipeline_output.py +21 -0
  115. diffusers/pipelines/kolors/text_encoder.py +889 -0
  116. diffusers/pipelines/kolors/tokenizer.py +334 -0
  117. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
  118. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
  119. diffusers/pipelines/latte/__init__.py +48 -0
  120. diffusers/pipelines/latte/pipeline_latte.py +881 -0
  121. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
  122. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
  123. diffusers/pipelines/lumina/__init__.py +48 -0
  124. diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
  125. diffusers/pipelines/pag/__init__.py +67 -0
  126. diffusers/pipelines/pag/pag_utils.py +237 -0
  127. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
  128. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
  129. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
  130. diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
  131. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
  132. diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
  133. diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
  134. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
  135. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
  136. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
  137. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
  138. diffusers/pipelines/pia/pipeline_pia.py +30 -37
  139. diffusers/pipelines/pipeline_flax_utils.py +4 -9
  140. diffusers/pipelines/pipeline_loading_utils.py +0 -3
  141. diffusers/pipelines/pipeline_utils.py +2 -14
  142. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
  143. diffusers/pipelines/stable_audio/__init__.py +50 -0
  144. diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
  145. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
  146. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
  147. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  148. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
  149. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
  150. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
  151. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
  152. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
  153. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
  154. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
  155. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
  156. diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
  157. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
  158. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
  159. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
  160. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
  161. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
  162. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
  163. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
  164. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
  165. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
  166. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
  167. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
  168. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
  170. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
  171. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
  172. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
  173. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
  174. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
  175. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
  176. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
  177. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
  178. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
  179. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  180. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  181. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
  182. diffusers/schedulers/__init__.py +8 -0
  183. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
  184. diffusers/schedulers/scheduling_ddim.py +1 -1
  185. diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
  186. diffusers/schedulers/scheduling_ddpm.py +1 -1
  187. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
  188. diffusers/schedulers/scheduling_deis_multistep.py +2 -2
  189. diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
  190. diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
  191. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
  192. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
  193. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
  194. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
  195. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
  196. diffusers/schedulers/scheduling_ipndm.py +1 -1
  197. diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
  198. diffusers/schedulers/scheduling_utils.py +1 -3
  199. diffusers/schedulers/scheduling_utils_flax.py +1 -3
  200. diffusers/training_utils.py +99 -14
  201. diffusers/utils/__init__.py +2 -2
  202. diffusers/utils/dummy_pt_objects.py +210 -0
  203. diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
  204. diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
  205. diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
  206. diffusers/utils/dynamic_modules_utils.py +1 -11
  207. diffusers/utils/export_utils.py +1 -4
  208. diffusers/utils/hub_utils.py +45 -42
  209. diffusers/utils/import_utils.py +19 -16
  210. diffusers/utils/loading_utils.py +76 -3
  211. diffusers/utils/testing_utils.py +11 -8
  212. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/METADATA +73 -83
  213. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/RECORD +217 -164
  214. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/WHEEL +1 -1
  215. diffusers/loaders/autoencoder.py +0 -146
  216. diffusers/loaders/controlnet.py +0 -136
  217. diffusers/loaders/lora.py +0 -1728
  218. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/LICENSE +0 -0
  219. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/entry_points.txt +0 -0
  220. {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/top_level.txt +0 -0
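The headline additions above are whole new model families: Flux (pipelines/flux/), CogVideoX, AuraFlow, Kolors, Latte, Lumina, Stable Audio, and the PAG (perturbed-attention guidance) pipeline set, plus a LoRA loader split into loaders/lora_base.py and loaders/lora_pipeline.py. For orientation, a minimal sketch of driving one of the new pipelines; the checkpoint id and call arguments are assumptions based on the Flux release, not taken from this diff:

import torch
from diffusers import FluxPipeline

# Assumed checkpoint id; FLUX.1-schnell is the distilled few-step variant.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell",
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()  # offload idle submodules to CPU to reduce VRAM use

image = pipe(
    "a photo of a corgi wearing sunglasses",
    num_inference_steps=4,  # the distilled model targets very few steps
    guidance_scale=0.0,     # and runs without classifier-free guidance
).images[0]
image.save("corgi.png")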
@@ -22,7 +22,7 @@ from torch.nn import functional as F
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

  from ...image_processor import VaeImageProcessor
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, UNet2DConditionModel
  from ...models.attention_processor import Attention
  from ...models.lora import adjust_lora_scale_text_encoder
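The same one-line rename recurs in nearly every hunk below: the monolithic LoraLoaderMixin from the removed loaders/lora.py becomes the pipeline-specific StableDiffusionLoraLoaderMixin from the new loaders/lora_pipeline.py, with shared machinery in loaders/lora_base.py. A hedged migration sketch for downstream code that imported the old name; whether a deprecated alias of the old name is kept should be verified against the release notes:

# 0.29.x:
# from diffusers.loaders import LoraLoaderMixin

# 0.30.0: per-pipeline mixins; SD and SDXL each get their own class.
from diffusers.loaders import StableDiffusionLoraLoaderMixin, StableDiffusionXLLoraLoaderMixin

# End-user call sites are unchanged, only the class providing them moved:
# pipe.load_lora_weights("author/some-lora", weight_name="pytorch_lora_weights.safetensors")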
@@ -323,7 +323,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
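The guard above is what lets a caller-supplied LoRA scale reach the text encoder during prompt encoding. In the standard Stable Diffusion call path the scale arrives through cross_attention_kwargs; a sketch, assuming the commonly used checkpoint id and a hypothetical LoRA repo:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16  # assumed checkpoint id
).to("cuda")
pipe.load_lora_weights("author/some-lora")  # hypothetical LoRA repo id

image = pipe(
    "a prompt",
    # "scale" is popped from cross_attention_kwargs and forwarded to
    # encode_prompt as lora_scale, hitting the isinstance branch above
    cross_attention_kwargs={"scale": 0.7},
).images[0]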
@@ -456,7 +456,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

  from ...configuration_utils import FrozenDict
  from ...image_processor import VaeImageProcessor
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, UNet2DConditionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers
@@ -234,7 +234,7 @@ def preprocess_mask(mask, batch_size: int = 1):


  class StableDiffusionDiffEditPipeline(
-     DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+     DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
  ):
      r"""
      <Tip warning={true}>
@@ -250,8 +250,8 @@ class StableDiffusionDiffEditPipeline(

      The pipeline also inherits the following loading and saving methods:
          - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

      Args:
          vae ([`AutoencoderKL`]):
@@ -448,7 +448,7 @@ class StableDiffusionDiffEditPipeline(
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
@@ -581,7 +581,7 @@ class StableDiffusionDiffEditPipeline(
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -18,10 +18,10 @@ from typing import Any, Callable, Dict, List, Optional, Union

  import PIL.Image
  import torch
- from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

  from ...image_processor import VaeImageProcessor
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, UNet2DConditionModel
  from ...models.attention import GatedSelfAttentionDense
  from ...models.lora import adjust_lora_scale_text_encoder
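CLIPFeatureExtractor to CLIPImageProcessor is a follow-up to a transformers deprecation: recent transformers versions keep CLIPFeatureExtractor only as a deprecated subclass of CLIPImageProcessor, so the swap is behavior-preserving. A sketch of the replacement class in isolation (the repo id is illustrative):

from PIL import Image
from transformers import CLIPImageProcessor

feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Same preprocessing contract as the old feature extractor:
# resize, center-crop, and normalize PIL images into pixel_values tensors.
pil_image = Image.new("RGB", (512, 512))
inputs = feature_extractor(images=[pil_image], return_tensors="pt")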
@@ -138,7 +138,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
          unet: UNet2DConditionModel,
          scheduler: KarrasDiffusionSchedulers,
          safety_checker: StableDiffusionSafetyChecker,
-         feature_extractor: CLIPFeatureExtractor,
+         feature_extractor: CLIPImageProcessor,
          requires_safety_checker: bool = True,
      ):
          super().__init__()
@@ -249,7 +249,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
@@ -382,7 +382,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -19,7 +19,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
  import PIL.Image
  import torch
  from transformers import (
-     CLIPFeatureExtractor,
+     CLIPImageProcessor,
      CLIPProcessor,
      CLIPTextModel,
      CLIPTokenizer,
@@ -27,7 +27,7 @@ from transformers import (
  )

  from ...image_processor import VaeImageProcessor
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, UNet2DConditionModel
  from ...models.attention import GatedSelfAttentionDense
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -193,7 +193,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
          unet: UNet2DConditionModel,
          scheduler: KarrasDiffusionSchedulers,
          safety_checker: StableDiffusionSafetyChecker,
-         feature_extractor: CLIPFeatureExtractor,
+         feature_extractor: CLIPImageProcessor,
          requires_safety_checker: bool = True,
      ):
          super().__init__()
@@ -274,7 +274,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
@@ -407,7 +407,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -21,7 +21,7 @@ from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
  from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras

  from ...image_processor import VaeImageProcessor
- from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import LMSDiscreteScheduler
  from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
@@ -48,7 +48,7 @@ class ModelWrapper:


  class StableDiffusionKDiffusionPipeline(
-     DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+     DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
  ):
      r"""
      Pipeline for text-to-image generation using Stable Diffusion.
@@ -58,8 +58,8 @@ class StableDiffusionKDiffusionPipeline(

      The pipeline also inherits the following loading methods:
          - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

      <Tip warning={true}>

@@ -223,7 +223,7 @@ class StableDiffusionKDiffusionPipeline(
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
@@ -356,7 +356,7 @@ class StableDiffusionKDiffusionPipeline(
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -36,8 +36,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel
  from ...models.attention_processor import (
      AttnProcessor2_0,
      FusedAttnProcessor2_0,
-     LoRAAttnProcessor2_0,
-     LoRAXFormersAttnProcessor,
      XFormersAttnProcessor,
  )
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -543,8 +541,6 @@ class StableDiffusionXLKDiffusionPipeline(
              (
                  AttnProcessor2_0,
                  XFormersAttnProcessor,
-                 LoRAXFormersAttnProcessor,
-                 LoRAAttnProcessor2_0,
                  FusedAttnProcessor2_0,
              ),
          )
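LoRAAttnProcessor2_0 and LoRAXFormersAttnProcessor disappear from this isinstance check (part of the VAE-upcasting helper) because diffusers now routes LoRA through the PEFT backend, which patches linear layers instead of installing dedicated attention processors. A sketch of the post-change logic, assuming the usual upcast_vae structure:

import torch
from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    FusedAttnProcessor2_0,
    XFormersAttnProcessor,
)

def upcast_vae_sketch(vae):
    dtype = vae.dtype
    vae.to(dtype=torch.float32)  # run the VAE in fp32 to avoid overflow
    use_torch_2_0_or_xformers = isinstance(
        vae.decoder.mid_block.attentions[0].processor,
        (AttnProcessor2_0, XFormersAttnProcessor, FusedAttnProcessor2_0),
    )
    if use_torch_2_0_or_xformers:
        # memory-efficient attention can keep these submodules in the original dtype
        vae.post_quant_conv.to(dtype)
        vae.decoder.conv_in.to(dtype)
        vae.decoder.mid_block.to(dtype)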
@@ -22,7 +22,7 @@ import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

  from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D
- from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import KarrasDiffusionSchedulers
@@ -161,7 +161,7 @@ class StableDiffusionLDM3DPipeline(
      StableDiffusionMixin,
      TextualInversionLoaderMixin,
      IPAdapterMixin,
-     LoraLoaderMixin,
+     StableDiffusionLoraLoaderMixin,
      FromSingleFileMixin,
  ):
      r"""
@@ -172,8 +172,8 @@ class StableDiffusionLDM3DPipeline(

      The pipeline also inherits the following loading methods:
          - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
          - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
          - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
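Since the mixin list above advertises load_ip_adapter, and the hunks below rework the IP-Adapter embedding path, a usage sketch for context; the checkpoint id, adapter repo, subfolder, weight name, and URL follow the commonly documented h94/IP-Adapter layout and are assumptions here:

import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils import load_image

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16  # assumed checkpoint id
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipe.set_ip_adapter_scale(0.6)  # blend strength of the image prompt

reference_image = load_image("https://example.com/reference.png")  # placeholder URL
image = pipe(prompt="best quality, high quality", ip_adapter_image=reference_image).images[0]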
@@ -323,7 +323,7 @@ class StableDiffusionLDM3DPipeline(
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
@@ -456,7 +456,7 @@ class StableDiffusionLDM3DPipeline(
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -491,6 +491,9 @@ class StableDiffusionLDM3DPipeline(
      def prepare_ip_adapter_image_embeds(
          self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
      ):
+         image_embeds = []
+         if do_classifier_free_guidance:
+             negative_image_embeds = []
          if ip_adapter_image_embeds is None:
              if not isinstance(ip_adapter_image, list):
                  ip_adapter_image = [ip_adapter_image]
@@ -500,7 +503,6 @@ class StableDiffusionLDM3DPipeline(
                      f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                  )

-             image_embeds = []
              for single_ip_adapter_image, image_proj_layer in zip(
                  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
              ):
@@ -508,36 +510,28 @@ class StableDiffusionLDM3DPipeline(
                  single_image_embeds, single_negative_image_embeds = self.encode_image(
                      single_ip_adapter_image, device, 1, output_hidden_state
                  )
-                 single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                 single_negative_image_embeds = torch.stack(
-                     [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                 )

+                 image_embeds.append(single_image_embeds[None, :])
                  if do_classifier_free_guidance:
-                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                     single_image_embeds = single_image_embeds.to(device)
-
-                 image_embeds.append(single_image_embeds)
+                     negative_image_embeds.append(single_negative_image_embeds[None, :])
          else:
-             repeat_dims = [1]
-             image_embeds = []
              for single_image_embeds in ip_adapter_image_embeds:
                  if do_classifier_free_guidance:
                      single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                     single_image_embeds = single_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                     )
-                     single_negative_image_embeds = single_negative_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                     )
-                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                 else:
-                     single_image_embeds = single_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                     )
+                     negative_image_embeds.append(single_negative_image_embeds)
                  image_embeds.append(single_image_embeds)

-         return image_embeds
+         ip_adapter_image_embeds = []
+         for i, single_image_embeds in enumerate(image_embeds):
+             single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+             if do_classifier_free_guidance:
+                 single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                 single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+             single_image_embeds = single_image_embeds.to(device=device)
+             ip_adapter_image_embeds.append(single_image_embeds)
+
+         return ip_adapter_image_embeds

      def run_safety_checker(self, image, device, dtype):
          if self.safety_checker is None:
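The rewrite above (repeated verbatim for the panorama and SDXL pipelines below) separates collection from batching: per-adapter positive and negative embeds are gathered first, and a single tail loop then tiles each tensor num_images_per_prompt times and, under classifier-free guidance, concatenates [negative, positive] along the batch axis. A standalone sketch of that tail loop with toy tensors:

import torch

num_images_per_prompt = 2
do_classifier_free_guidance = True
image_embeds = [torch.ones(1, 4, 8)]            # one adapter; real embeds are [1, seq_len, dim]
negative_image_embeds = [torch.zeros(1, 4, 8)]  # matching negative embeds

ip_adapter_image_embeds = []
for i, single_image_embeds in enumerate(image_embeds):
    # tile for each requested image per prompt
    single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
    if do_classifier_free_guidance:
        single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
        # negative first, positive second, matching the CFG batch layout
        single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
    ip_adapter_image_embeds.append(single_image_embeds)

print(ip_adapter_image_embeds[0].shape)  # torch.Size([4, 4, 8])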
@@ -19,7 +19,7 @@ import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import DDIMScheduler
@@ -135,7 +135,11 @@ def retrieve_timesteps(


  class StableDiffusionPanoramaPipeline(
-     DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
+     DiffusionPipeline,
+     StableDiffusionMixin,
+     TextualInversionLoaderMixin,
+     StableDiffusionLoraLoaderMixin,
+     IPAdapterMixin,
  ):
      r"""
      Pipeline for text-to-image generation using MultiDiffusion.
@@ -145,8 +149,8 @@ class StableDiffusionPanoramaPipeline(

      The pipeline also inherits the following loading methods:
          - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+         - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
          - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

      Args:
@@ -295,7 +299,7 @@ class StableDiffusionPanoramaPipeline(
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
@@ -428,7 +432,7 @@ class StableDiffusionPanoramaPipeline(
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -463,6 +467,9 @@ class StableDiffusionPanoramaPipeline(
      def prepare_ip_adapter_image_embeds(
          self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
      ):
+         image_embeds = []
+         if do_classifier_free_guidance:
+             negative_image_embeds = []
          if ip_adapter_image_embeds is None:
              if not isinstance(ip_adapter_image, list):
                  ip_adapter_image = [ip_adapter_image]
@@ -472,7 +479,6 @@ class StableDiffusionPanoramaPipeline(
                      f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                  )

-             image_embeds = []
              for single_ip_adapter_image, image_proj_layer in zip(
                  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
              ):
@@ -480,36 +486,28 @@ class StableDiffusionPanoramaPipeline(
                  single_image_embeds, single_negative_image_embeds = self.encode_image(
                      single_ip_adapter_image, device, 1, output_hidden_state
                  )
-                 single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                 single_negative_image_embeds = torch.stack(
-                     [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                 )

+                 image_embeds.append(single_image_embeds[None, :])
                  if do_classifier_free_guidance:
-                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                     single_image_embeds = single_image_embeds.to(device)
-
-                 image_embeds.append(single_image_embeds)
+                     negative_image_embeds.append(single_negative_image_embeds[None, :])
          else:
-             repeat_dims = [1]
-             image_embeds = []
              for single_image_embeds in ip_adapter_image_embeds:
                  if do_classifier_free_guidance:
                      single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                     single_image_embeds = single_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                     )
-                     single_negative_image_embeds = single_negative_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                     )
-                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                 else:
-                     single_image_embeds = single_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                     )
+                     negative_image_embeds.append(single_negative_image_embeds)
                  image_embeds.append(single_image_embeds)

-         return image_embeds
+         ip_adapter_image_embeds = []
+         for i, single_image_embeds in enumerate(image_embeds):
+             single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+             if do_classifier_free_guidance:
+                 single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                 single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+             single_image_embeds = single_image_embeds.to(device=device)
+             ip_adapter_image_embeds.append(single_image_embeds)
+
+         return ip_adapter_image_embeds

      # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
      def run_safety_checker(self, image, device, dtype):
@@ -20,7 +20,7 @@ import torch.nn.functional as F
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import KarrasDiffusionSchedulers
@@ -238,7 +238,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
          """
          # set lora scale so that monkey patched LoRA
          # function of text encoder can correctly access it
-         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+         if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
              self._lora_scale = lora_scale

              # dynamically adjust the LoRA scale
@@ -371,7 +371,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
              negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

          if self.text_encoder is not None:
-             if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+             if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                  # Retrieve the original scale by scaling back the LoRA layers
                  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -36,8 +36,6 @@ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
  from ...models.attention_processor import (
      AttnProcessor2_0,
      FusedAttnProcessor2_0,
-     LoRAAttnProcessor2_0,
-     LoRAXFormersAttnProcessor,
      XFormersAttnProcessor,
  )
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -537,6 +535,9 @@ class StableDiffusionXLPipeline(
      def prepare_ip_adapter_image_embeds(
          self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
      ):
+         image_embeds = []
+         if do_classifier_free_guidance:
+             negative_image_embeds = []
          if ip_adapter_image_embeds is None:
              if not isinstance(ip_adapter_image, list):
                  ip_adapter_image = [ip_adapter_image]
@@ -546,7 +547,6 @@ class StableDiffusionXLPipeline(
                      f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                  )

-             image_embeds = []
              for single_ip_adapter_image, image_proj_layer in zip(
                  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
              ):
@@ -554,36 +554,28 @@ class StableDiffusionXLPipeline(
                  single_image_embeds, single_negative_image_embeds = self.encode_image(
                      single_ip_adapter_image, device, 1, output_hidden_state
                  )
-                 single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                 single_negative_image_embeds = torch.stack(
-                     [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                 )

+                 image_embeds.append(single_image_embeds[None, :])
                  if do_classifier_free_guidance:
-                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                     single_image_embeds = single_image_embeds.to(device)
-
-                 image_embeds.append(single_image_embeds)
+                     negative_image_embeds.append(single_negative_image_embeds[None, :])
          else:
-             repeat_dims = [1]
-             image_embeds = []
              for single_image_embeds in ip_adapter_image_embeds:
                  if do_classifier_free_guidance:
                      single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                     single_image_embeds = single_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                     )
-                     single_negative_image_embeds = single_negative_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                     )
-                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                 else:
-                     single_image_embeds = single_image_embeds.repeat(
-                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                     )
+                     negative_image_embeds.append(single_negative_image_embeds)
                  image_embeds.append(single_image_embeds)

-         return image_embeds
+         ip_adapter_image_embeds = []
+         for i, single_image_embeds in enumerate(image_embeds):
+             single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+             if do_classifier_free_guidance:
+                 single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                 single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+             single_image_embeds = single_image_embeds.to(device=device)
+             ip_adapter_image_embeds.append(single_image_embeds)
+
+         return ip_adapter_image_embeds

      # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
      def prepare_extra_step_kwargs(self, generator, eta):
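One consequence of the else branch above: when a caller passes precomputed ip_adapter_image_embeds while classifier-free guidance is active, each tensor must already be the [negative, positive] concatenation, because the pipeline splits it back apart with chunk(2). A hedged sketch of preparing such input with toy shapes:

import torch

neg = torch.zeros(1, 4, 8)  # toy negative embeds for one adapter
pos = torch.ones(1, 4, 8)   # toy positive embeds
precomputed = [torch.cat([neg, pos], dim=0)]  # the layout that chunk(2) undoes

# images = pipe(prompt, ip_adapter_image_embeds=precomputed).images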
@@ -748,8 +740,6 @@ class StableDiffusionXLPipeline(
              (
                  AttnProcessor2_0,
                  XFormersAttnProcessor,
-                 LoRAXFormersAttnProcessor,
-                 LoRAAttnProcessor2_0,
                  FusedAttnProcessor2_0,
              ),
          )