diffusers 0.29.2__py3-none-any.whl → 0.30.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. diffusers/__init__.py +94 -3
  2. diffusers/commands/env.py +1 -5
  3. diffusers/configuration_utils.py +4 -9
  4. diffusers/dependency_versions_table.py +2 -2
  5. diffusers/image_processor.py +1 -2
  6. diffusers/loaders/__init__.py +17 -2
  7. diffusers/loaders/ip_adapter.py +10 -7
  8. diffusers/loaders/lora_base.py +752 -0
  9. diffusers/loaders/lora_pipeline.py +2252 -0
  10. diffusers/loaders/peft.py +213 -5
  11. diffusers/loaders/single_file.py +3 -14
  12. diffusers/loaders/single_file_model.py +31 -10
  13. diffusers/loaders/single_file_utils.py +293 -8
  14. diffusers/loaders/textual_inversion.py +1 -6
  15. diffusers/loaders/unet.py +23 -208
  16. diffusers/models/__init__.py +20 -0
  17. diffusers/models/activations.py +22 -0
  18. diffusers/models/attention.py +386 -7
  19. diffusers/models/attention_processor.py +1937 -629
  20. diffusers/models/autoencoders/__init__.py +2 -0
  21. diffusers/models/autoencoders/autoencoder_kl.py +14 -3
  22. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1271 -0
  23. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  24. diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
  25. diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
  26. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  27. diffusers/models/autoencoders/vq_model.py +4 -4
  28. diffusers/models/controlnet.py +2 -3
  29. diffusers/models/controlnet_hunyuan.py +401 -0
  30. diffusers/models/controlnet_sd3.py +11 -11
  31. diffusers/models/controlnet_sparsectrl.py +789 -0
  32. diffusers/models/controlnet_xs.py +40 -10
  33. diffusers/models/downsampling.py +68 -0
  34. diffusers/models/embeddings.py +403 -36
  35. diffusers/models/model_loading_utils.py +1 -3
  36. diffusers/models/modeling_flax_utils.py +1 -6
  37. diffusers/models/modeling_utils.py +4 -16
  38. diffusers/models/normalization.py +203 -12
  39. diffusers/models/transformers/__init__.py +6 -0
  40. diffusers/models/transformers/auraflow_transformer_2d.py +543 -0
  41. diffusers/models/transformers/cogvideox_transformer_3d.py +485 -0
  42. diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
  43. diffusers/models/transformers/latte_transformer_3d.py +327 -0
  44. diffusers/models/transformers/lumina_nextdit2d.py +340 -0
  45. diffusers/models/transformers/pixart_transformer_2d.py +102 -1
  46. diffusers/models/transformers/prior_transformer.py +1 -1
  47. diffusers/models/transformers/stable_audio_transformer.py +458 -0
  48. diffusers/models/transformers/transformer_flux.py +455 -0
  49. diffusers/models/transformers/transformer_sd3.py +18 -4
  50. diffusers/models/unets/unet_1d_blocks.py +1 -1
  51. diffusers/models/unets/unet_2d_condition.py +8 -1
  52. diffusers/models/unets/unet_3d_blocks.py +51 -920
  53. diffusers/models/unets/unet_3d_condition.py +4 -1
  54. diffusers/models/unets/unet_i2vgen_xl.py +4 -1
  55. diffusers/models/unets/unet_kandinsky3.py +1 -1
  56. diffusers/models/unets/unet_motion_model.py +1330 -84
  57. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  58. diffusers/models/unets/unet_stable_cascade.py +1 -3
  59. diffusers/models/unets/uvit_2d.py +1 -1
  60. diffusers/models/upsampling.py +64 -0
  61. diffusers/models/vq_model.py +8 -4
  62. diffusers/optimization.py +1 -1
  63. diffusers/pipelines/__init__.py +100 -3
  64. diffusers/pipelines/animatediff/__init__.py +4 -0
  65. diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
  66. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
  70. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  71. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
  72. diffusers/pipelines/aura_flow/__init__.py +48 -0
  73. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
  74. diffusers/pipelines/auto_pipeline.py +97 -19
  75. diffusers/pipelines/cogvideo/__init__.py +48 -0
  76. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +746 -0
  77. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  78. diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
  79. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
  80. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
  81. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
  82. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
  83. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
  84. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  85. diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
  86. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
  87. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
  88. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
  90. diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
  91. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
  96. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
  97. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
  98. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
  99. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  100. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
  101. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
  103. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  104. diffusers/pipelines/flux/__init__.py +47 -0
  105. diffusers/pipelines/flux/pipeline_flux.py +749 -0
  106. diffusers/pipelines/flux/pipeline_output.py +21 -0
  107. diffusers/pipelines/free_init_utils.py +2 -0
  108. diffusers/pipelines/free_noise_utils.py +236 -0
  109. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
  110. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
  111. diffusers/pipelines/kolors/__init__.py +54 -0
  112. diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
  113. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
  114. diffusers/pipelines/kolors/pipeline_output.py +21 -0
  115. diffusers/pipelines/kolors/text_encoder.py +889 -0
  116. diffusers/pipelines/kolors/tokenizer.py +334 -0
  117. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
  118. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
  119. diffusers/pipelines/latte/__init__.py +48 -0
  120. diffusers/pipelines/latte/pipeline_latte.py +881 -0
  121. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
  122. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
  123. diffusers/pipelines/lumina/__init__.py +48 -0
  124. diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
  125. diffusers/pipelines/pag/__init__.py +67 -0
  126. diffusers/pipelines/pag/pag_utils.py +237 -0
  127. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
  128. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
  129. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
  130. diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
  131. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
  132. diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
  133. diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
  134. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
  135. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
  136. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
  137. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
  138. diffusers/pipelines/pia/pipeline_pia.py +30 -37
  139. diffusers/pipelines/pipeline_flax_utils.py +4 -9
  140. diffusers/pipelines/pipeline_loading_utils.py +0 -3
  141. diffusers/pipelines/pipeline_utils.py +2 -14
  142. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
  143. diffusers/pipelines/stable_audio/__init__.py +50 -0
  144. diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
  145. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
  146. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
  147. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  148. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
  149. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
  150. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
  151. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
  152. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
  153. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
  154. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
  155. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
  156. diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
  157. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
  158. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
  159. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
  160. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
  161. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
  162. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
  163. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
  164. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
  165. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
  166. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
  167. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
  168. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
  170. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
  171. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
  172. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
  173. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
  174. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
  175. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
  176. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
  177. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
  178. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
  179. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  180. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  181. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
  182. diffusers/schedulers/__init__.py +8 -0
  183. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
  184. diffusers/schedulers/scheduling_ddim.py +1 -1
  185. diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
  186. diffusers/schedulers/scheduling_ddpm.py +1 -1
  187. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
  188. diffusers/schedulers/scheduling_deis_multistep.py +2 -2
  189. diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
  190. diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
  191. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
  192. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
  193. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
  194. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
  195. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
  196. diffusers/schedulers/scheduling_ipndm.py +1 -1
  197. diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
  198. diffusers/schedulers/scheduling_utils.py +1 -3
  199. diffusers/schedulers/scheduling_utils_flax.py +1 -3
  200. diffusers/training_utils.py +99 -14
  201. diffusers/utils/__init__.py +2 -2
  202. diffusers/utils/dummy_pt_objects.py +210 -0
  203. diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
  204. diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
  205. diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
  206. diffusers/utils/dynamic_modules_utils.py +1 -11
  207. diffusers/utils/export_utils.py +50 -6
  208. diffusers/utils/hub_utils.py +45 -42
  209. diffusers/utils/import_utils.py +37 -15
  210. diffusers/utils/loading_utils.py +80 -3
  211. diffusers/utils/testing_utils.py +11 -8
  212. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/METADATA +73 -83
  213. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/RECORD +217 -164
  214. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/WHEEL +1 -1
  215. diffusers/loaders/autoencoder.py +0 -146
  216. diffusers/loaders/controlnet.py +0 -136
  217. diffusers/loaders/lora.py +0 -1728
  218. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/LICENSE +0 -0
  219. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/entry_points.txt +0 -0
  220. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/top_level.txt +0 -0
@@ -36,8 +36,6 @@ from ...loaders import (
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -712,6 +710,13 @@ class StableDiffusionXLImg2ImgPipeline(
             )

         elif isinstance(generator, list):
+            if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+            elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                )
+
             init_latents = [
                 retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                 for i in range(batch_size)
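The added branch mirrors the batch-duplication rule used elsewhere in diffusers: an image batch smaller than the effective batch size is repeated only when it divides the batch size evenly, otherwise a ValueError is raised. A minimal standalone sketch of that rule (the helper name is illustrative, not part of the diffusers API):

import torch

def expand_image_batch(image: torch.Tensor, batch_size: int) -> torch.Tensor:
    # Illustrative restatement of the rule added above: repeat the image batch
    # only when it divides the requested batch size evenly.
    if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
        return torch.cat([image] * (batch_size // image.shape[0]), dim=0)
    if image.shape[0] < batch_size:
        raise ValueError(
            f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size}"
        )
    return image

# e.g. a single preprocessed image expanded to match batch_size=4
assert expand_image_batch(torch.randn(1, 3, 64, 64), 4).shape[0] == 4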
@@ -725,8 +730,8 @@ class StableDiffusionXLImg2ImgPipeline(

         init_latents = init_latents.to(dtype)
         if latents_mean is not None and latents_std is not None:
-            latents_mean = latents_mean.to(device=self.device, dtype=dtype)
-            latents_std = latents_std.to(device=self.device, dtype=dtype)
+            latents_mean = latents_mean.to(device=device, dtype=dtype)
+            latents_std = latents_std.to(device=device, dtype=dtype)
             init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
         else:
             init_latents = self.vae.config.scaling_factor * init_latents
@@ -781,6 +786,9 @@ class StableDiffusionXLImg2ImgPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -790,7 +798,6 @@ class StableDiffusionXLImg2ImgPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -798,36 +805,28 @@ class StableDiffusionXLImg2ImgPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     def _get_add_time_ids(
         self,
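The rewritten prepare_ip_adapter_image_embeds collects positive and negative embeddings first and defers both the num_images_per_prompt repetition and the CFG concatenation to a single final loop, so each returned tensor holds the negative embeddings stacked ahead of the positive ones when classifier-free guidance is enabled. A rough sketch of that final pass and the resulting shape (hypothetical helper; shapes chosen only for illustration):

import torch

def combine_ip_adapter_embeds(image_embeds, negative_image_embeds, num_images_per_prompt, do_cfg):
    # Mirrors the final loop introduced above: repeat per prompt, then prepend
    # the negative embeddings when classifier-free guidance is used.
    out = []
    for i, emb in enumerate(image_embeds):
        emb = torch.cat([emb] * num_images_per_prompt, dim=0)
        if do_cfg:
            neg = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
            emb = torch.cat([neg, emb], dim=0)
        out.append(emb)
    return out

pos = [torch.randn(1, 4, 768)]   # one IP-Adapter, illustrative embedding shape
neg = [torch.zeros(1, 4, 768)]
embeds = combine_ip_adapter_embeds(pos, neg, num_images_per_prompt=2, do_cfg=True)
assert embeds[0].shape == (4, 4, 768)  # 2 negative rows followed by 2 positive rows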
@@ -889,8 +888,6 @@ class StableDiffusionXLImg2ImgPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
@@ -37,8 +37,6 @@ from ...loaders import (
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -132,124 +130,6 @@ def mask_pil_to_torch(mask, height, width):
     return mask


-def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False):
-    """
-    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
-    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
-    ``image`` and ``1`` for the ``mask``.
-
-    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
-    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
-
-    Args:
-        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
-            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
-        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
-            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
-
-
-    Raises:
-        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
-        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
-        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
-            (ot the other way around).
-
-    Returns:
-        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
-        dimensions: ``batch x channels x height x width``.
-    """
-
-    # checkpoint. TOD(Yiyi) - need to clean this up later
-    deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead"
-    deprecate(
-        "prepare_mask_and_masked_image",
-        "0.30.0",
-        deprecation_message,
-    )
-    if image is None:
-        raise ValueError("`image` input cannot be undefined.")
-
-    if mask is None:
-        raise ValueError("`mask_image` input cannot be undefined.")
-
-    if isinstance(image, torch.Tensor):
-        if not isinstance(mask, torch.Tensor):
-            mask = mask_pil_to_torch(mask, height, width)
-
-        if image.ndim == 3:
-            image = image.unsqueeze(0)
-
-        # Batch and add channel dim for single mask
-        if mask.ndim == 2:
-            mask = mask.unsqueeze(0).unsqueeze(0)
-
-        # Batch single mask or add channel dim
-        if mask.ndim == 3:
-            # Single batched mask, no channel dim or single mask not batched but channel dim
-            if mask.shape[0] == 1:
-                mask = mask.unsqueeze(0)
-
-            # Batched masks no channel dim
-            else:
-                mask = mask.unsqueeze(1)
-
-        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
-        # assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
-        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
-
-        # Check image is in [-1, 1]
-        # if image.min() < -1 or image.max() > 1:
-        #     raise ValueError("Image should be in [-1, 1] range")
-
-        # Check mask is in [0, 1]
-        if mask.min() < 0 or mask.max() > 1:
-            raise ValueError("Mask should be in [0, 1] range")
-
-        # Binarize mask
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-
-        # Image as float32
-        image = image.to(dtype=torch.float32)
-    elif isinstance(mask, torch.Tensor):
-        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
-    else:
-        # preprocess image
-        if isinstance(image, (PIL.Image.Image, np.ndarray)):
-            image = [image]
-        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
-            # resize all images w.r.t passed height an width
-            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
-            image = [np.array(i.convert("RGB"))[None, :] for i in image]
-            image = np.concatenate(image, axis=0)
-        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
-            image = np.concatenate([i[None, :] for i in image], axis=0)
-
-        image = image.transpose(0, 3, 1, 2)
-        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-        mask = mask_pil_to_torch(mask, height, width)
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-
-        if image.shape[1] == 4:
-            # images are in latent space and thus can't
-            # be masked set masked_image to None
-            # we assume that the checkpoint is not an inpainting
-            # checkpoint. TOD(Yiyi) - need to clean this up later
-            masked_image = None
-        else:
-            masked_image = image * (mask < 0.5)
-
-    # n.b. ensure backwards compatibility as old function does not return image
-    if return_image:
-        return mask, masked_image, image
-
-    return mask, masked_image
-
-
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
     encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
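The deleted prepare_mask_and_masked_image helper had been deprecated (see the deprecate(..., "0.30.0", ...) call above) in favor of VaeImageProcessor.preprocess. A hedged sketch of the replacement path; the processor arguments below are assumptions chosen to reproduce the old normalize/binarize behavior, not values shown in this diff:

import torch
from PIL import Image
from diffusers.image_processor import VaeImageProcessor

# Assumed configuration: normalize the init image to [-1, 1], binarize the mask.
image_processor = VaeImageProcessor(vae_scale_factor=8)
mask_processor = VaeImageProcessor(
    vae_scale_factor=8, do_normalize=False, do_binarize=True, do_convert_grayscale=True
)

init_image = Image.new("RGB", (1024, 1024), "white")
mask_image = Image.new("L", (1024, 1024), 0)

image = image_processor.preprocess(init_image, height=1024, width=1024)  # float32 in [-1, 1]
mask = mask_processor.preprocess(mask_image, height=1024, width=1024)    # float32 in {0, 1}
masked_image = image * (mask < 0.5)  # same product the removed helper returned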
@@ -474,6 +354,9 @@ class StableDiffusionXLInpaintPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -483,7 +366,6 @@ class StableDiffusionXLInpaintPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -491,36 +373,28 @@ class StableDiffusionXLInpaintPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
     def encode_prompt(
@@ -1119,8 +993,6 @@ class StableDiffusionXLInpaintPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
@@ -25,8 +25,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
     FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -592,8 +590,6 @@ class StableDiffusionXLInstructPix2PixPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
                 FusedAttnProcessor2_0,
             ),
         )
@@ -19,10 +19,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import numpy as np
 import PIL.Image
 import torch
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
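Two renames recur through the remaining hunks: LoraLoaderMixin becomes StableDiffusionLoraLoaderMixin (backed by the new loaders/lora_base.py and loaders/lora_pipeline.py modules listed above) and the deprecated transformers CLIPFeatureExtractor alias gives way to CLIPImageProcessor. A minimal sketch of how a custom pipeline would track the new names; whether the old names still import as deprecated aliases in 0.30.x is not shown in this diff:

from diffusers import DiffusionPipeline
from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from transformers import CLIPImageProcessor  # was typed as CLIPFeatureExtractor in 0.29.x-era code

class MyAdapterLikePipeline(DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin):
    # Sketch only: in 0.29.x the LoRA base class here was LoraLoaderMixin.
    def __init__(self, feature_extractor: CLIPImageProcessor = None):
        super().__init__()
        self.register_modules(feature_extractor=feature_extractor)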
@@ -209,7 +209,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """

@@ -225,7 +225,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
@@ -340,7 +340,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

         # dynamically adjust the LoRA scale
@@ -473,7 +473,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -36,8 +36,6 @@ from ...loaders import (
 from ...models import AutoencoderKL, ImageProjection, MultiAdapter, T2IAdapter, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -239,7 +237,7 @@ class StableDiffusionXLAdapterPipeline(
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
+        feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """

@@ -550,6 +548,9 @@ class StableDiffusionXLAdapterPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -559,7 +560,6 @@ class StableDiffusionXLAdapterPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -567,36 +567,28 @@ class StableDiffusionXLAdapterPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
@@ -764,8 +756,6 @@ class StableDiffusionXLAdapterPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
@@ -18,7 +18,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import torch
 from transformers import CLIPTextModel, CLIPTokenizer

-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet3DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -58,7 +58,9 @@ EXAMPLE_DOC_STRING = """
 """


-class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
+class TextToVideoSDPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
     r"""
     Pipeline for text-to-video generation.

@@ -67,8 +69,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve

     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

     Args:
         vae ([`AutoencoderKL`]):
@@ -183,7 +185,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

         # dynamically adjust the LoRA scale
@@ -316,7 +318,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -19,7 +19,7 @@ import numpy as np
 import torch
 from transformers import CLIPTextModel, CLIPTokenizer

-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet3DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -93,7 +93,9 @@ def retrieve_latents(
     raise AttributeError("Could not access latents of provided encoder_output")


-class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
+class VideoToVideoSDPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
     r"""
     Pipeline for text-guided video-to-video generation.

@@ -102,8 +104,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv

     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

     Args:
         vae ([`AutoencoderKL`]):
@@ -218,7 +220,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

         # dynamically adjust the LoRA scale
@@ -351,7 +353,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -11,7 +11,7 @@ from torch.nn.functional import grid_sample
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -281,7 +281,9 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s
     return warped_latents


-class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
+class TextToVideoZeroPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
     r"""
     Pipeline for zero-shot text-to-video generation using Stable Diffusion.

@@ -831,7 +833,7 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

         # dynamically adjust the LoRA scale
@@ -964,7 +966,7 @@ class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualIn
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -22,8 +22,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
     FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -438,8 +436,6 @@ class TextToVideoZeroSDXLPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
                 FusedAttnProcessor2_0,
             ),
         )
@@ -14,7 +14,7 @@ from transformers import (
 )

 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -422,7 +422,7 @@ class UniDiffuserPipeline(DiffusionPipeline):
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

         # dynamically adjust the LoRA scale
@@ -555,7 +555,7 @@ class UniDiffuserPipeline(DiffusionPipeline):
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)