diffusers 0.29.2__py3-none-any.whl → 0.30.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. diffusers/__init__.py +94 -3
  2. diffusers/commands/env.py +1 -5
  3. diffusers/configuration_utils.py +4 -9
  4. diffusers/dependency_versions_table.py +2 -2
  5. diffusers/image_processor.py +1 -2
  6. diffusers/loaders/__init__.py +17 -2
  7. diffusers/loaders/ip_adapter.py +10 -7
  8. diffusers/loaders/lora_base.py +752 -0
  9. diffusers/loaders/lora_pipeline.py +2252 -0
  10. diffusers/loaders/peft.py +213 -5
  11. diffusers/loaders/single_file.py +3 -14
  12. diffusers/loaders/single_file_model.py +31 -10
  13. diffusers/loaders/single_file_utils.py +293 -8
  14. diffusers/loaders/textual_inversion.py +1 -6
  15. diffusers/loaders/unet.py +23 -208
  16. diffusers/models/__init__.py +20 -0
  17. diffusers/models/activations.py +22 -0
  18. diffusers/models/attention.py +386 -7
  19. diffusers/models/attention_processor.py +1937 -629
  20. diffusers/models/autoencoders/__init__.py +2 -0
  21. diffusers/models/autoencoders/autoencoder_kl.py +14 -3
  22. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1271 -0
  23. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  24. diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
  25. diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
  26. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  27. diffusers/models/autoencoders/vq_model.py +4 -4
  28. diffusers/models/controlnet.py +2 -3
  29. diffusers/models/controlnet_hunyuan.py +401 -0
  30. diffusers/models/controlnet_sd3.py +11 -11
  31. diffusers/models/controlnet_sparsectrl.py +789 -0
  32. diffusers/models/controlnet_xs.py +40 -10
  33. diffusers/models/downsampling.py +68 -0
  34. diffusers/models/embeddings.py +403 -36
  35. diffusers/models/model_loading_utils.py +1 -3
  36. diffusers/models/modeling_flax_utils.py +1 -6
  37. diffusers/models/modeling_utils.py +4 -16
  38. diffusers/models/normalization.py +203 -12
  39. diffusers/models/transformers/__init__.py +6 -0
  40. diffusers/models/transformers/auraflow_transformer_2d.py +543 -0
  41. diffusers/models/transformers/cogvideox_transformer_3d.py +485 -0
  42. diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
  43. diffusers/models/transformers/latte_transformer_3d.py +327 -0
  44. diffusers/models/transformers/lumina_nextdit2d.py +340 -0
  45. diffusers/models/transformers/pixart_transformer_2d.py +102 -1
  46. diffusers/models/transformers/prior_transformer.py +1 -1
  47. diffusers/models/transformers/stable_audio_transformer.py +458 -0
  48. diffusers/models/transformers/transformer_flux.py +455 -0
  49. diffusers/models/transformers/transformer_sd3.py +18 -4
  50. diffusers/models/unets/unet_1d_blocks.py +1 -1
  51. diffusers/models/unets/unet_2d_condition.py +8 -1
  52. diffusers/models/unets/unet_3d_blocks.py +51 -920
  53. diffusers/models/unets/unet_3d_condition.py +4 -1
  54. diffusers/models/unets/unet_i2vgen_xl.py +4 -1
  55. diffusers/models/unets/unet_kandinsky3.py +1 -1
  56. diffusers/models/unets/unet_motion_model.py +1330 -84
  57. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  58. diffusers/models/unets/unet_stable_cascade.py +1 -3
  59. diffusers/models/unets/uvit_2d.py +1 -1
  60. diffusers/models/upsampling.py +64 -0
  61. diffusers/models/vq_model.py +8 -4
  62. diffusers/optimization.py +1 -1
  63. diffusers/pipelines/__init__.py +100 -3
  64. diffusers/pipelines/animatediff/__init__.py +4 -0
  65. diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
  66. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
  70. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  71. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
  72. diffusers/pipelines/aura_flow/__init__.py +48 -0
  73. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
  74. diffusers/pipelines/auto_pipeline.py +97 -19
  75. diffusers/pipelines/cogvideo/__init__.py +48 -0
  76. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +746 -0
  77. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  78. diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
  79. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
  80. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
  81. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
  82. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
  83. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
  84. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  85. diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
  86. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
  87. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
  88. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
  90. diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
  91. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
  96. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
  97. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
  98. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
  99. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  100. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
  101. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
  103. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  104. diffusers/pipelines/flux/__init__.py +47 -0
  105. diffusers/pipelines/flux/pipeline_flux.py +749 -0
  106. diffusers/pipelines/flux/pipeline_output.py +21 -0
  107. diffusers/pipelines/free_init_utils.py +2 -0
  108. diffusers/pipelines/free_noise_utils.py +236 -0
  109. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
  110. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
  111. diffusers/pipelines/kolors/__init__.py +54 -0
  112. diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
  113. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
  114. diffusers/pipelines/kolors/pipeline_output.py +21 -0
  115. diffusers/pipelines/kolors/text_encoder.py +889 -0
  116. diffusers/pipelines/kolors/tokenizer.py +334 -0
  117. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
  118. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
  119. diffusers/pipelines/latte/__init__.py +48 -0
  120. diffusers/pipelines/latte/pipeline_latte.py +881 -0
  121. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
  122. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
  123. diffusers/pipelines/lumina/__init__.py +48 -0
  124. diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
  125. diffusers/pipelines/pag/__init__.py +67 -0
  126. diffusers/pipelines/pag/pag_utils.py +237 -0
  127. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
  128. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
  129. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
  130. diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
  131. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
  132. diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
  133. diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
  134. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
  135. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
  136. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
  137. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
  138. diffusers/pipelines/pia/pipeline_pia.py +30 -37
  139. diffusers/pipelines/pipeline_flax_utils.py +4 -9
  140. diffusers/pipelines/pipeline_loading_utils.py +0 -3
  141. diffusers/pipelines/pipeline_utils.py +2 -14
  142. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
  143. diffusers/pipelines/stable_audio/__init__.py +50 -0
  144. diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
  145. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
  146. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
  147. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  148. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
  149. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
  150. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
  151. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
  152. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
  153. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
  154. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
  155. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
  156. diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
  157. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
  158. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
  159. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
  160. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
  161. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
  162. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
  163. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
  164. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
  165. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
  166. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
  167. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
  168. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
  170. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
  171. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
  172. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
  173. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
  174. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
  175. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
  176. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
  177. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
  178. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
  179. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  180. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  181. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
  182. diffusers/schedulers/__init__.py +8 -0
  183. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
  184. diffusers/schedulers/scheduling_ddim.py +1 -1
  185. diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
  186. diffusers/schedulers/scheduling_ddpm.py +1 -1
  187. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
  188. diffusers/schedulers/scheduling_deis_multistep.py +2 -2
  189. diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
  190. diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
  191. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
  192. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
  193. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
  194. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
  195. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
  196. diffusers/schedulers/scheduling_ipndm.py +1 -1
  197. diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
  198. diffusers/schedulers/scheduling_utils.py +1 -3
  199. diffusers/schedulers/scheduling_utils_flax.py +1 -3
  200. diffusers/training_utils.py +99 -14
  201. diffusers/utils/__init__.py +2 -2
  202. diffusers/utils/dummy_pt_objects.py +210 -0
  203. diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
  204. diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
  205. diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
  206. diffusers/utils/dynamic_modules_utils.py +1 -11
  207. diffusers/utils/export_utils.py +50 -6
  208. diffusers/utils/hub_utils.py +45 -42
  209. diffusers/utils/import_utils.py +37 -15
  210. diffusers/utils/loading_utils.py +80 -3
  211. diffusers/utils/testing_utils.py +11 -8
  212. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/METADATA +73 -83
  213. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/RECORD +217 -164
  214. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/WHEEL +1 -1
  215. diffusers/loaders/autoencoder.py +0 -146
  216. diffusers/loaders/controlnet.py +0 -136
  217. diffusers/loaders/lora.py +0 -1728
  218. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/LICENSE +0 -0
  219. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/entry_points.txt +0 -0
  220. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/top_level.txt +0 -0
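The loader reorganization dominates this release: `diffusers/loaders/lora.py` is removed, and `diffusers/loaders/lora_base.py` plus `diffusers/loaders/lora_pipeline.py` take its place with pipeline-specific mixins such as `StableDiffusionLoraLoaderMixin`, which the hunks below adopt in the Stable Diffusion ControlNet pipelines. A minimal sketch of the user-facing side, which keeps the same `load_lora_weights` interface (the LoRA repo id is a placeholder, not something from this diff):

```python
# Minimal sketch, not taken from this diff: end-user LoRA loading is unchanged,
# because the pipelines now inherit StableDiffusionLoraLoaderMixin, which exposes
# the same load_lora_weights / save_lora_weights methods as the old mixin.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("some-user/some-sd15-lora")  # placeholder repo id
image = pipe("a photo of a penguin", cross_attention_kwargs={"scale": 0.8}).images[0]
```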
@@ -52,7 +52,7 @@ EXAMPLE_DOC_STRING = """
  >>> image.save("cd_imagenet64_l2_onestep_sample_penguin.png")

  >>> # Multistep sampling, class-conditional image generation
- >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original Github repo:
+ >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original GitHub repo:
  >>> # https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L77
  >>> image = pipe(num_inference_steps=None, timesteps=[22, 0], class_labels=145).images[0]
  >>> image.save("cd_imagenet64_l2_multistep_sample_penguin.png")
@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV

  from ...callbacks import MultiPipelineCallbacks, PipelineCallback
  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import KarrasDiffusionSchedulers
@@ -156,7 +156,7 @@ class StableDiffusionControlNetPipeline(
  DiffusionPipeline,
  StableDiffusionMixin,
  TextualInversionLoaderMixin,
- LoraLoaderMixin,
+ StableDiffusionLoraLoaderMixin,
  IPAdapterMixin,
  FromSingleFileMixin,
  ):
@@ -168,8 +168,8 @@ class StableDiffusionControlNetPipeline(

  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -331,7 +331,7 @@ class StableDiffusionControlNetPipeline(
  """
  # set lora scale so that monkey patched LoRA
  # function of text encoder can correctly access it
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
  self._lora_scale = lora_scale

  # dynamically adjust the LoRA scale
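These encode_prompt hunks only swap the mixin checked in the guard; the underlying behavior, scaling the text encoder's LoRA layers while the prompt is encoded and restoring them afterwards (as the next hunk's `unscale_lora_layers` call shows), is unchanged. A minimal sketch of that pattern, assuming the PEFT backend; `encode_with_lora_scale` is a hypothetical helper written for illustration, not a diffusers API:

```python
# Minimal sketch, assuming the PEFT backend. scale_lora_layers and
# unscale_lora_layers are the diffusers.utils helpers these pipelines use;
# the wrapper function itself is hypothetical.
import torch
from diffusers.utils import scale_lora_layers, unscale_lora_layers


def encode_with_lora_scale(text_encoder, input_ids: torch.Tensor, lora_scale: float):
    # Scale the text encoder's LoRA layers for this forward pass only...
    scale_lora_layers(text_encoder, lora_scale)
    try:
        prompt_embeds = text_encoder(input_ids)[0]
    finally:
        # ...then restore the original scale, mirroring
        # `unscale_lora_layers(self.text_encoder, lora_scale)` in the diff.
        unscale_lora_layers(text_encoder, lora_scale)
    return prompt_embeds
```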
@@ -464,7 +464,7 @@ class StableDiffusionControlNetPipeline(
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

  if self.text_encoder is not None:
- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
  # Retrieve the original scale by scaling back the LoRA layers
  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -499,6 +499,9 @@ class StableDiffusionControlNetPipeline(
  def prepare_ip_adapter_image_embeds(
  self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
  ):
+ image_embeds = []
+ if do_classifier_free_guidance:
+ negative_image_embeds = []
  if ip_adapter_image_embeds is None:
  if not isinstance(ip_adapter_image, list):
  ip_adapter_image = [ip_adapter_image]
@@ -508,7 +511,6 @@ class StableDiffusionControlNetPipeline(
  f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
  )

- image_embeds = []
  for single_ip_adapter_image, image_proj_layer in zip(
  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
  ):
@@ -516,36 +518,28 @@ class StableDiffusionControlNetPipeline(
  single_image_embeds, single_negative_image_embeds = self.encode_image(
  single_ip_adapter_image, device, 1, output_hidden_state
  )
- single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
- single_negative_image_embeds = torch.stack(
- [single_negative_image_embeds] * num_images_per_prompt, dim=0
- )

+ image_embeds.append(single_image_embeds[None, :])
  if do_classifier_free_guidance:
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- single_image_embeds = single_image_embeds.to(device)
-
- image_embeds.append(single_image_embeds)
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
  else:
- repeat_dims = [1]
- image_embeds = []
  for single_image_embeds in ip_adapter_image_embeds:
  if do_classifier_free_guidance:
  single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
- single_negative_image_embeds = single_negative_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
- )
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- else:
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
+ negative_image_embeds.append(single_negative_image_embeds)
  image_embeds.append(single_image_embeds)

- return image_embeds
+ ip_adapter_image_embeds = []
+ for i, single_image_embeds in enumerate(image_embeds):
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+ if do_classifier_free_guidance:
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+ single_image_embeds = single_image_embeds.to(device=device)
+ ip_adapter_image_embeds.append(single_image_embeds)
+
+ return ip_adapter_image_embeds

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
  def run_safety_checker(self, image, device, dtype):
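The same `prepare_ip_adapter_image_embeds` rewrite appears in every pipeline touched in this diff: per-adapter embeddings are collected first, and only a final pass tiles them to `num_images_per_prompt` and concatenates them with their negative counterparts for classifier-free guidance. A standalone sketch of that collect-then-tile flow, with dummy tensors standing in for what the pipeline's `encode_image` would return:

```python
# Standalone sketch of the collect-then-tile flow; shapes are illustrative
# stand-ins for encode_image's output, not values from this diff.
from typing import List
import torch


def collect_then_tile(
    positive: List[torch.Tensor],      # one embed per IP Adapter, shape (seq, dim)
    negative: List[torch.Tensor],      # matching negative embeds
    num_images_per_prompt: int,
    do_classifier_free_guidance: bool,
) -> List[torch.Tensor]:
    image_embeds = [p[None, :] for p in positive]            # add the batch dim first
    negative_image_embeds = [n[None, :] for n in negative]

    out = []
    for i, single in enumerate(image_embeds):
        single = torch.cat([single] * num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            neg = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
            single = torch.cat([neg, single], dim=0)          # negative first, as in the diff
        out.append(single)
    return out


# Tiny usage example with dummy tensors:
pos = [torch.randn(4, 768)]
neg = [torch.zeros(4, 768)]
print([t.shape for t in collect_then_tile(pos, neg, 2, True)])  # [torch.Size([4, 4, 768])]
```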
@@ -1278,7 +1272,7 @@ class StableDiffusionControlNetPipeline(
  )

  if guess_mode and self.do_classifier_free_guidance:
- # Infered ControlNet only for the conditional batch.
+ # Inferred ControlNet only for the conditional batch.
  # To apply the output of ControlNet to both the unconditional and conditional batches,
  # add 0 to the unconditional batch to keep it unchanged.
  down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV

  from ...callbacks import MultiPipelineCallbacks, PipelineCallback
  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import KarrasDiffusionSchedulers
@@ -134,7 +134,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
  DiffusionPipeline,
  StableDiffusionMixin,
  TextualInversionLoaderMixin,
- LoraLoaderMixin,
+ StableDiffusionLoraLoaderMixin,
  IPAdapterMixin,
  FromSingleFileMixin,
  ):
@@ -146,8 +146,8 @@ class StableDiffusionControlNetImg2ImgPipeline(

  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -309,7 +309,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
  """
  # set lora scale so that monkey patched LoRA
  # function of text encoder can correctly access it
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
  self._lora_scale = lora_scale

  # dynamically adjust the LoRA scale
@@ -442,7 +442,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

  if self.text_encoder is not None:
- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
  # Retrieve the original scale by scaling back the LoRA layers
  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -477,6 +477,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
  def prepare_ip_adapter_image_embeds(
  self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
  ):
+ image_embeds = []
+ if do_classifier_free_guidance:
+ negative_image_embeds = []
  if ip_adapter_image_embeds is None:
  if not isinstance(ip_adapter_image, list):
  ip_adapter_image = [ip_adapter_image]
@@ -486,7 +489,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
  f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
  )

- image_embeds = []
  for single_ip_adapter_image, image_proj_layer in zip(
  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
  ):
@@ -494,36 +496,28 @@ class StableDiffusionControlNetImg2ImgPipeline(
  single_image_embeds, single_negative_image_embeds = self.encode_image(
  single_ip_adapter_image, device, 1, output_hidden_state
  )
- single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
- single_negative_image_embeds = torch.stack(
- [single_negative_image_embeds] * num_images_per_prompt, dim=0
- )

+ image_embeds.append(single_image_embeds[None, :])
  if do_classifier_free_guidance:
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- single_image_embeds = single_image_embeds.to(device)
-
- image_embeds.append(single_image_embeds)
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
  else:
- repeat_dims = [1]
- image_embeds = []
  for single_image_embeds in ip_adapter_image_embeds:
  if do_classifier_free_guidance:
  single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
- single_negative_image_embeds = single_negative_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
- )
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- else:
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
+ negative_image_embeds.append(single_negative_image_embeds)
  image_embeds.append(single_image_embeds)

- return image_embeds
+ ip_adapter_image_embeds = []
+ for i, single_image_embeds in enumerate(image_embeds):
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+ if do_classifier_free_guidance:
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+ single_image_embeds = single_image_embeds.to(device=device)
+ ip_adapter_image_embeds.append(single_image_embeds)
+
+ return ip_adapter_image_embeds

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
  def run_safety_checker(self, image, device, dtype):
@@ -830,6 +824,13 @@ class StableDiffusionControlNetImg2ImgPipeline(
  )

  elif isinstance(generator, list):
+ if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+ image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+ elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+ raise ValueError(
+ f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+ )
+
  init_latents = [
  retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
  for i in range(batch_size)
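The added guard lets a smaller init-image batch be repeated up to the effective batch size on the per-generator path, and raises when the two sizes are not evenly divisible. The same logic on a plain tensor, as a small sketch:

```python
# Minimal sketch of the duplication guard added to prepare_latents;
# mirrors the diff's condition and error message on a standalone tensor.
import torch


def duplicate_to_batch(image: torch.Tensor, batch_size: int) -> torch.Tensor:
    if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
        # Repeat the image batch to the effective batch size.
        return torch.cat([image] * (batch_size // image.shape[0]), dim=0)
    if image.shape[0] < batch_size:
        raise ValueError(
            f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size}"
        )
    return image


print(duplicate_to_batch(torch.randn(1, 3, 64, 64), 4).shape)  # torch.Size([4, 3, 64, 64])
```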
@@ -1243,7 +1244,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
  )

  if guess_mode and self.do_classifier_free_guidance:
- # Infered ControlNet only for the conditional batch.
+ # Inferred ControlNet only for the conditional batch.
  # To apply the output of ControlNet to both the unconditional and conditional batches,
  # add 0 to the unconditional batch to keep it unchanged.
  down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -25,7 +25,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV

  from ...callbacks import MultiPipelineCallbacks, PipelineCallback
  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...schedulers import KarrasDiffusionSchedulers
@@ -118,134 +118,11 @@ def retrieve_latents(
  raise AttributeError("Could not access latents of provided encoder_output")


- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image
- def prepare_mask_and_masked_image(image, mask, height, width, return_image=False):
- """
- Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
- converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
- ``image`` and ``1`` for the ``mask``.
-
- The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
- binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
-
- Args:
- image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
- It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
- ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
- mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
- It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
- ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
-
-
- Raises:
- ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
- should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
- TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
- (ot the other way around).
-
- Returns:
- tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
- dimensions: ``batch x channels x height x width``.
- """
- deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead"
- deprecate(
- "prepare_mask_and_masked_image",
- "0.30.0",
- deprecation_message,
- )
- if image is None:
- raise ValueError("`image` input cannot be undefined.")
-
- if mask is None:
- raise ValueError("`mask_image` input cannot be undefined.")
-
- if isinstance(image, torch.Tensor):
- if not isinstance(mask, torch.Tensor):
- raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
-
- # Batch single image
- if image.ndim == 3:
- assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
- image = image.unsqueeze(0)
-
- # Batch and add channel dim for single mask
- if mask.ndim == 2:
- mask = mask.unsqueeze(0).unsqueeze(0)
-
- # Batch single mask or add channel dim
- if mask.ndim == 3:
- # Single batched mask, no channel dim or single mask not batched but channel dim
- if mask.shape[0] == 1:
- mask = mask.unsqueeze(0)
-
- # Batched masks no channel dim
- else:
- mask = mask.unsqueeze(1)
-
- assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
- assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
- assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
-
- # Check image is in [-1, 1]
- if image.min() < -1 or image.max() > 1:
- raise ValueError("Image should be in [-1, 1] range")
-
- # Check mask is in [0, 1]
- if mask.min() < 0 or mask.max() > 1:
- raise ValueError("Mask should be in [0, 1] range")
-
- # Binarize mask
- mask[mask < 0.5] = 0
- mask[mask >= 0.5] = 1
-
- # Image as float32
- image = image.to(dtype=torch.float32)
- elif isinstance(mask, torch.Tensor):
- raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
- else:
- # preprocess image
- if isinstance(image, (PIL.Image.Image, np.ndarray)):
- image = [image]
- if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
- # resize all images w.r.t passed height an width
- image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
- image = [np.array(i.convert("RGB"))[None, :] for i in image]
- image = np.concatenate(image, axis=0)
- elif isinstance(image, list) and isinstance(image[0], np.ndarray):
- image = np.concatenate([i[None, :] for i in image], axis=0)
-
- image = image.transpose(0, 3, 1, 2)
- image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
- # preprocess mask
- if isinstance(mask, (PIL.Image.Image, np.ndarray)):
- mask = [mask]
-
- if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
- mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
- mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
- mask = mask.astype(np.float32) / 255.0
- elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
- mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
-
- mask[mask < 0.5] = 0
- mask[mask >= 0.5] = 1
- mask = torch.from_numpy(mask)
-
- masked_image = image * (mask < 0.5)
-
- # n.b. ensure backwards compatibility as old function does not return image
- if return_image:
- return mask, masked_image, image
-
- return mask, masked_image
-
-
  class StableDiffusionControlNetInpaintPipeline(
  DiffusionPipeline,
  StableDiffusionMixin,
  TextualInversionLoaderMixin,
- LoraLoaderMixin,
+ StableDiffusionLoraLoaderMixin,
  IPAdapterMixin,
  FromSingleFileMixin,
  ):
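The removed `prepare_mask_and_masked_image` helper had been deprecated, and its own message pointed to `VaeImageProcessor.preprocess` as the replacement. A minimal sketch of that replacement path; the mask-processor flags below follow the inpaint pipelines' usual configuration and are an assumption here, not a quote from this diff:

```python
# Minimal sketch, assuming the typical inpaint-pipeline processor setup.
# Placeholder PIL images stand in for real inputs.
import torch
from PIL import Image
from diffusers.image_processor import VaeImageProcessor

image_processor = VaeImageProcessor(vae_scale_factor=8)
mask_processor = VaeImageProcessor(
    vae_scale_factor=8, do_normalize=False, do_binarize=True, do_convert_grayscale=True
)

init_image = Image.new("RGB", (512, 512), "white")  # placeholder inputs
mask_image = Image.new("L", (512, 512), 255)

image = image_processor.preprocess(init_image, height=512, width=512)  # float32 in [-1, 1]
mask = mask_processor.preprocess(mask_image, height=512, width=512)    # binarized to {0, 1}
masked_image = image * (mask < 0.5)                                     # same masking rule as the old helper
print(image.shape, mask.shape, masked_image.shape)
```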
@@ -257,8 +134,8 @@ class StableDiffusionControlNetInpaintPipeline(

  The pipeline also inherits the following loading methods:
  - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
  - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
  - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -434,7 +311,7 @@ class StableDiffusionControlNetInpaintPipeline(
  """
  # set lora scale so that monkey patched LoRA
  # function of text encoder can correctly access it
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
  self._lora_scale = lora_scale

  # dynamically adjust the LoRA scale
@@ -567,7 +444,7 @@ class StableDiffusionControlNetInpaintPipeline(
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

  if self.text_encoder is not None:
- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
  # Retrieve the original scale by scaling back the LoRA layers
  unscale_lora_layers(self.text_encoder, lora_scale)

@@ -602,6 +479,9 @@ class StableDiffusionControlNetInpaintPipeline(
  def prepare_ip_adapter_image_embeds(
  self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
  ):
+ image_embeds = []
+ if do_classifier_free_guidance:
+ negative_image_embeds = []
  if ip_adapter_image_embeds is None:
  if not isinstance(ip_adapter_image, list):
  ip_adapter_image = [ip_adapter_image]
@@ -611,7 +491,6 @@ class StableDiffusionControlNetInpaintPipeline(
  f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
  )

- image_embeds = []
  for single_ip_adapter_image, image_proj_layer in zip(
  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
  ):
@@ -619,36 +498,28 @@ class StableDiffusionControlNetInpaintPipeline(
  single_image_embeds, single_negative_image_embeds = self.encode_image(
  single_ip_adapter_image, device, 1, output_hidden_state
  )
- single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
- single_negative_image_embeds = torch.stack(
- [single_negative_image_embeds] * num_images_per_prompt, dim=0
- )

+ image_embeds.append(single_image_embeds[None, :])
  if do_classifier_free_guidance:
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- single_image_embeds = single_image_embeds.to(device)
-
- image_embeds.append(single_image_embeds)
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
  else:
- repeat_dims = [1]
- image_embeds = []
  for single_image_embeds in ip_adapter_image_embeds:
  if do_classifier_free_guidance:
  single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
- single_negative_image_embeds = single_negative_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
- )
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- else:
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
+ negative_image_embeds.append(single_negative_image_embeds)
  image_embeds.append(single_image_embeds)

- return image_embeds
+ ip_adapter_image_embeds = []
+ for i, single_image_embeds in enumerate(image_embeds):
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+ if do_classifier_free_guidance:
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+ single_image_embeds = single_image_embeds.to(device=device)
+ ip_adapter_image_embeds.append(single_image_embeds)
+
+ return ip_adapter_image_embeds

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
  def run_safety_checker(self, image, device, dtype):
@@ -1537,7 +1408,7 @@ class StableDiffusionControlNetInpaintPipeline(
  )

  if guess_mode and self.do_classifier_free_guidance:
- # Infered ControlNet only for the conditional batch.
+ # Inferred ControlNet only for the conditional batch.
  # To apply the output of ControlNet to both the unconditional and conditional batches,
  # add 0 to the unconditional batch to keep it unchanged.
  down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -38,8 +38,6 @@ from ...loaders import (
  from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
  from ...models.attention_processor import (
  AttnProcessor2_0,
- LoRAAttnProcessor2_0,
- LoRAXFormersAttnProcessor,
  XFormersAttnProcessor,
  )
  from ...models.lora import adjust_lora_scale_text_encoder
@@ -86,6 +84,7 @@ EXAMPLE_DOC_STRING = """
  >>> # !pip install transformers accelerate
  >>> from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
  >>> from diffusers.utils import load_image
+ >>> from PIL import Image
  >>> import numpy as np
  >>> import torch

@@ -534,6 +533,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
  def prepare_ip_adapter_image_embeds(
  self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
  ):
+ image_embeds = []
+ if do_classifier_free_guidance:
+ negative_image_embeds = []
  if ip_adapter_image_embeds is None:
  if not isinstance(ip_adapter_image, list):
  ip_adapter_image = [ip_adapter_image]
@@ -543,7 +545,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
  f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
  )

- image_embeds = []
  for single_ip_adapter_image, image_proj_layer in zip(
  ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
  ):
@@ -551,36 +552,28 @@ class StableDiffusionXLControlNetInpaintPipeline(
  single_image_embeds, single_negative_image_embeds = self.encode_image(
  single_ip_adapter_image, device, 1, output_hidden_state
  )
- single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
- single_negative_image_embeds = torch.stack(
- [single_negative_image_embeds] * num_images_per_prompt, dim=0
- )

+ image_embeds.append(single_image_embeds[None, :])
  if do_classifier_free_guidance:
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- single_image_embeds = single_image_embeds.to(device)
-
- image_embeds.append(single_image_embeds)
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
  else:
- repeat_dims = [1]
- image_embeds = []
  for single_image_embeds in ip_adapter_image_embeds:
  if do_classifier_free_guidance:
  single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
- single_negative_image_embeds = single_negative_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
- )
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
- else:
- single_image_embeds = single_image_embeds.repeat(
- num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
- )
+ negative_image_embeds.append(single_negative_image_embeds)
  image_embeds.append(single_image_embeds)

- return image_embeds
+ ip_adapter_image_embeds = []
+ for i, single_image_embeds in enumerate(image_embeds):
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+ if do_classifier_free_guidance:
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+ single_image_embeds = single_image_embeds.to(device=device)
+ ip_adapter_image_embeds.append(single_image_embeds)
+
+ return ip_adapter_image_embeds

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
  def prepare_extra_step_kwargs(self, generator, eta):
@@ -1117,8 +1110,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
  (
  AttnProcessor2_0,
  XFormersAttnProcessor,
- LoRAXFormersAttnProcessor,
- LoRAAttnProcessor2_0,
  ),
  )
  # if xformers or torch_2_0 is used attention block does not need
@@ -1748,7 +1739,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
  )

  if guess_mode and self.do_classifier_free_guidance:
- # Infered ControlNet only for the conditional batch.
+ # Inferred ControlNet only for the conditional batch.
  # To apply the output of ControlNet to both the unconditional and conditional batches,
  # add 0 to the unconditional batch to keep it unchanged.
  down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]