diffusers 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl

Files changed (174)
  1. diffusers/__init__.py +11 -1
  2. diffusers/commands/fp16_safetensors.py +10 -11
  3. diffusers/configuration_utils.py +12 -8
  4. diffusers/dependency_versions_table.py +2 -1
  5. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  6. diffusers/image_processor.py +286 -46
  7. diffusers/loaders/ip_adapter.py +11 -9
  8. diffusers/loaders/lora.py +198 -60
  9. diffusers/loaders/single_file.py +24 -18
  10. diffusers/loaders/textual_inversion.py +10 -14
  11. diffusers/loaders/unet.py +130 -37
  12. diffusers/models/__init__.py +18 -12
  13. diffusers/models/activations.py +9 -6
  14. diffusers/models/attention.py +137 -16
  15. diffusers/models/attention_processor.py +133 -46
  16. diffusers/models/autoencoders/__init__.py +5 -0
  17. diffusers/models/{autoencoder_asym_kl.py → autoencoders/autoencoder_asym_kl.py} +4 -4
  18. diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +45 -6
  19. diffusers/models/{autoencoder_kl_temporal_decoder.py → autoencoders/autoencoder_kl_temporal_decoder.py} +8 -8
  20. diffusers/models/{autoencoder_tiny.py → autoencoders/autoencoder_tiny.py} +4 -4
  21. diffusers/models/{consistency_decoder_vae.py → autoencoders/consistency_decoder_vae.py} +14 -14
  22. diffusers/models/{vae.py → autoencoders/vae.py} +9 -5
  23. diffusers/models/downsampling.py +338 -0
  24. diffusers/models/embeddings.py +112 -29
  25. diffusers/models/modeling_flax_utils.py +12 -7
  26. diffusers/models/modeling_utils.py +10 -10
  27. diffusers/models/normalization.py +108 -2
  28. diffusers/models/resnet.py +15 -699
  29. diffusers/models/transformer_2d.py +2 -2
  30. diffusers/models/unet_2d_condition.py +37 -0
  31. diffusers/models/{unet_kandi3.py → unet_kandinsky3.py} +105 -159
  32. diffusers/models/upsampling.py +454 -0
  33. diffusers/models/uvit_2d.py +471 -0
  34. diffusers/models/vq_model.py +9 -2
  35. diffusers/pipelines/__init__.py +81 -73
  36. diffusers/pipelines/amused/__init__.py +62 -0
  37. diffusers/pipelines/amused/pipeline_amused.py +328 -0
  38. diffusers/pipelines/amused/pipeline_amused_img2img.py +347 -0
  39. diffusers/pipelines/amused/pipeline_amused_inpaint.py +378 -0
  40. diffusers/pipelines/animatediff/pipeline_animatediff.py +38 -10
  41. diffusers/pipelines/auto_pipeline.py +17 -13
  42. diffusers/pipelines/controlnet/pipeline_controlnet.py +27 -10
  43. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +47 -5
  44. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +25 -8
  45. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +4 -6
  46. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +26 -10
  47. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +4 -3
  48. diffusers/pipelines/deprecated/__init__.py +153 -0
  49. diffusers/pipelines/{alt_diffusion → deprecated/alt_diffusion}/__init__.py +3 -3
  50. diffusers/pipelines/{alt_diffusion → deprecated/alt_diffusion}/pipeline_alt_diffusion.py +91 -18
  51. diffusers/pipelines/{alt_diffusion → deprecated/alt_diffusion}/pipeline_alt_diffusion_img2img.py +91 -18
  52. diffusers/pipelines/{alt_diffusion → deprecated/alt_diffusion}/pipeline_output.py +1 -1
  53. diffusers/pipelines/{audio_diffusion → deprecated/audio_diffusion}/__init__.py +1 -1
  54. diffusers/pipelines/{audio_diffusion → deprecated/audio_diffusion}/mel.py +2 -2
  55. diffusers/pipelines/{audio_diffusion → deprecated/audio_diffusion}/pipeline_audio_diffusion.py +4 -4
  56. diffusers/pipelines/{latent_diffusion_uncond → deprecated/latent_diffusion_uncond}/__init__.py +1 -1
  57. diffusers/pipelines/{latent_diffusion_uncond → deprecated/latent_diffusion_uncond}/pipeline_latent_diffusion_uncond.py +4 -4
  58. diffusers/pipelines/{pndm → deprecated/pndm}/__init__.py +1 -1
  59. diffusers/pipelines/{pndm → deprecated/pndm}/pipeline_pndm.py +4 -4
  60. diffusers/pipelines/{repaint → deprecated/repaint}/__init__.py +1 -1
  61. diffusers/pipelines/{repaint → deprecated/repaint}/pipeline_repaint.py +5 -5
  62. diffusers/pipelines/{score_sde_ve → deprecated/score_sde_ve}/__init__.py +1 -1
  63. diffusers/pipelines/{score_sde_ve → deprecated/score_sde_ve}/pipeline_score_sde_ve.py +4 -4
  64. diffusers/pipelines/{spectrogram_diffusion → deprecated/spectrogram_diffusion}/__init__.py +6 -6
  65. diffusers/pipelines/{spectrogram_diffusion/continous_encoder.py → deprecated/spectrogram_diffusion/continuous_encoder.py} +2 -2
  66. diffusers/pipelines/{spectrogram_diffusion → deprecated/spectrogram_diffusion}/midi_utils.py +1 -1
  67. diffusers/pipelines/{spectrogram_diffusion → deprecated/spectrogram_diffusion}/notes_encoder.py +2 -2
  68. diffusers/pipelines/{spectrogram_diffusion → deprecated/spectrogram_diffusion}/pipeline_spectrogram_diffusion.py +7 -7
  69. diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py +55 -0
  70. diffusers/pipelines/{stable_diffusion → deprecated/stable_diffusion_variants}/pipeline_cycle_diffusion.py +16 -11
  71. diffusers/pipelines/{stable_diffusion → deprecated/stable_diffusion_variants}/pipeline_onnx_stable_diffusion_inpaint_legacy.py +6 -6
  72. diffusers/pipelines/{stable_diffusion → deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_inpaint_legacy.py +11 -11
  73. diffusers/pipelines/{stable_diffusion → deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_model_editing.py +16 -11
  74. diffusers/pipelines/{stable_diffusion → deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_paradigms.py +10 -10
  75. diffusers/pipelines/{stable_diffusion → deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_pix2pix_zero.py +13 -13
  76. diffusers/pipelines/{stochastic_karras_ve → deprecated/stochastic_karras_ve}/__init__.py +1 -1
  77. diffusers/pipelines/{stochastic_karras_ve → deprecated/stochastic_karras_ve}/pipeline_stochastic_karras_ve.py +4 -4
  78. diffusers/pipelines/{versatile_diffusion → deprecated/versatile_diffusion}/__init__.py +3 -3
  79. diffusers/pipelines/{versatile_diffusion → deprecated/versatile_diffusion}/modeling_text_unet.py +54 -11
  80. diffusers/pipelines/{versatile_diffusion → deprecated/versatile_diffusion}/pipeline_versatile_diffusion.py +4 -4
  81. diffusers/pipelines/{versatile_diffusion → deprecated/versatile_diffusion}/pipeline_versatile_diffusion_dual_guided.py +6 -6
  82. diffusers/pipelines/{versatile_diffusion → deprecated/versatile_diffusion}/pipeline_versatile_diffusion_image_variation.py +6 -6
  83. diffusers/pipelines/{versatile_diffusion → deprecated/versatile_diffusion}/pipeline_versatile_diffusion_text_to_image.py +6 -6
  84. diffusers/pipelines/{vq_diffusion → deprecated/vq_diffusion}/__init__.py +3 -3
  85. diffusers/pipelines/{vq_diffusion → deprecated/vq_diffusion}/pipeline_vq_diffusion.py +5 -5
  86. diffusers/pipelines/kandinsky3/__init__.py +4 -4
  87. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +98 -0
  88. diffusers/pipelines/kandinsky3/{kandinsky3_pipeline.py → pipeline_kandinsky3.py} +172 -35
  89. diffusers/pipelines/kandinsky3/{kandinsky3img2img_pipeline.py → pipeline_kandinsky3_img2img.py} +228 -34
  90. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +46 -5
  91. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +47 -6
  92. diffusers/pipelines/onnx_utils.py +8 -5
  93. diffusers/pipelines/pipeline_flax_utils.py +7 -6
  94. diffusers/pipelines/pipeline_utils.py +30 -29
  95. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +51 -2
  96. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +3 -3
  97. diffusers/pipelines/stable_diffusion/__init__.py +1 -72
  98. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +67 -75
  99. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +92 -8
  100. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -8
  101. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +138 -10
  102. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +57 -7
  103. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -0
  104. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +6 -0
  105. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -0
  106. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -0
  107. diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py +48 -0
  108. diffusers/pipelines/{stable_diffusion → stable_diffusion_attend_and_excite}/pipeline_stable_diffusion_attend_and_excite.py +5 -2
  109. diffusers/pipelines/stable_diffusion_diffedit/__init__.py +48 -0
  110. diffusers/pipelines/{stable_diffusion → stable_diffusion_diffedit}/pipeline_stable_diffusion_diffedit.py +2 -3
  111. diffusers/pipelines/stable_diffusion_gligen/__init__.py +50 -0
  112. diffusers/pipelines/{stable_diffusion → stable_diffusion_gligen}/pipeline_stable_diffusion_gligen.py +2 -2
  113. diffusers/pipelines/{stable_diffusion → stable_diffusion_gligen}/pipeline_stable_diffusion_gligen_text_image.py +3 -3
  114. diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py +60 -0
  115. diffusers/pipelines/{stable_diffusion → stable_diffusion_k_diffusion}/pipeline_stable_diffusion_k_diffusion.py +6 -1
  116. diffusers/pipelines/stable_diffusion_ldm3d/__init__.py +48 -0
  117. diffusers/pipelines/{stable_diffusion → stable_diffusion_ldm3d}/pipeline_stable_diffusion_ldm3d.py +50 -7
  118. diffusers/pipelines/stable_diffusion_panorama/__init__.py +48 -0
  119. diffusers/pipelines/{stable_diffusion → stable_diffusion_panorama}/pipeline_stable_diffusion_panorama.py +56 -8
  120. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +58 -6
  121. diffusers/pipelines/stable_diffusion_sag/__init__.py +48 -0
  122. diffusers/pipelines/{stable_diffusion → stable_diffusion_sag}/pipeline_stable_diffusion_sag.py +67 -10
  123. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +97 -15
  124. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -14
  125. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +97 -14
  126. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +7 -5
  127. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +12 -9
  128. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +6 -0
  129. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +5 -0
  130. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +5 -0
  131. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +331 -9
  132. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +468 -9
  133. diffusers/pipelines/unclip/pipeline_unclip.py +2 -1
  134. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -0
  135. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  136. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +4 -0
  137. diffusers/schedulers/__init__.py +2 -0
  138. diffusers/schedulers/scheduling_amused.py +162 -0
  139. diffusers/schedulers/scheduling_consistency_models.py +2 -0
  140. diffusers/schedulers/scheduling_ddim_inverse.py +1 -4
  141. diffusers/schedulers/scheduling_ddpm.py +46 -0
  142. diffusers/schedulers/scheduling_ddpm_parallel.py +46 -0
  143. diffusers/schedulers/scheduling_deis_multistep.py +13 -1
  144. diffusers/schedulers/scheduling_dpmsolver_multistep.py +13 -1
  145. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +13 -1
  146. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -0
  147. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +13 -1
  148. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +58 -0
  149. diffusers/schedulers/scheduling_euler_discrete.py +62 -3
  150. diffusers/schedulers/scheduling_heun_discrete.py +2 -0
  151. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -0
  152. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -0
  153. diffusers/schedulers/scheduling_lms_discrete.py +2 -0
  154. diffusers/schedulers/scheduling_unipc_multistep.py +13 -1
  155. diffusers/schedulers/scheduling_utils.py +3 -1
  156. diffusers/schedulers/scheduling_utils_flax.py +3 -1
  157. diffusers/training_utils.py +1 -1
  158. diffusers/utils/__init__.py +0 -2
  159. diffusers/utils/constants.py +2 -5
  160. diffusers/utils/dummy_pt_objects.py +30 -0
  161. diffusers/utils/dummy_torch_and_transformers_objects.py +45 -0
  162. diffusers/utils/dynamic_modules_utils.py +14 -18
  163. diffusers/utils/hub_utils.py +24 -36
  164. diffusers/utils/logging.py +1 -1
  165. diffusers/utils/state_dict_utils.py +8 -0
  166. diffusers/utils/testing_utils.py +199 -1
  167. diffusers/utils/torch_utils.py +3 -3
  168. {diffusers-0.24.0.dist-info → diffusers-0.25.0.dist-info}/METADATA +54 -53
  169. {diffusers-0.24.0.dist-info → diffusers-0.25.0.dist-info}/RECORD +174 -155
  170. {diffusers-0.24.0.dist-info → diffusers-0.25.0.dist-info}/WHEEL +1 -1
  171. {diffusers-0.24.0.dist-info → diffusers-0.25.0.dist-info}/entry_points.txt +0 -1
  172. /diffusers/pipelines/{alt_diffusion → deprecated/alt_diffusion}/modeling_roberta_series.py +0 -0
  173. {diffusers-0.24.0.dist-info → diffusers-0.25.0.dist-info}/LICENSE +0 -0
  174. {diffusers-0.24.0.dist-info → diffusers-0.25.0.dist-info}/top_level.txt +0 -0
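
Note on the moves above: the VAE-type models now live under diffusers/models/autoencoders/, several older pipelines are grouped under diffusers/pipelines/deprecated/, and the Kandinsky 3 modules are renamed to the *_kandinsky3 spelling. A minimal import sketch against 0.25.0, assuming the public re-exports in diffusers/__init__.py and diffusers/models/__init__.py are kept in place (the small deltas listed above suggest they were only extended):

# top-level names are assumed to keep resolving after the reorganization
from diffusers import AutoencoderKL, ConsistencyDecoderVAE

# new submodule path created by diffusers/models/autoencoders/__init__.py
from diffusers.models.autoencoders import AutoencoderKL as RelocatedAutoencoderKL

# relocated (now deprecated) pipeline modules sit one level deeper
from diffusers.pipelines.deprecated.alt_diffusion import AltDiffusionPipeline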
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
@@ -1,4 +1,5 @@
  import copy
+ import inspect
  from dataclasses import dataclass
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union

@@ -15,11 +16,35 @@ from transformers import (
      CLIPVisionModelWithProjection,
  )

- from diffusers.models import AutoencoderKL, UNet2DConditionModel
- from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipeline
- from diffusers.schedulers import KarrasDiffusionSchedulers
- from diffusers.utils import BaseOutput
- from diffusers.utils.torch_utils import randn_tensor
+ from ...image_processor import VaeImageProcessor
+ from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+ from ...models import AutoencoderKL, UNet2DConditionModel
+ from ...models.attention_processor import (
+     AttnProcessor2_0,
+     FusedAttnProcessor2_0,
+     LoRAAttnProcessor2_0,
+     LoRAXFormersAttnProcessor,
+     XFormersAttnProcessor,
+ )
+ from ...models.lora import adjust_lora_scale_text_encoder
+ from ...schedulers import KarrasDiffusionSchedulers
+ from ...utils import (
+     USE_PEFT_BACKEND,
+     BaseOutput,
+     is_invisible_watermark_available,
+     logging,
+     scale_lora_layers,
+     unscale_lora_layers,
+ )
+ from ...utils.torch_utils import randn_tensor
+ from ..pipeline_utils import DiffusionPipeline
+
+
+ if is_invisible_watermark_available():
+     from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


  # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_0
@@ -300,7 +325,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
      return noise_cfg


- class TextToVideoZeroSDXLPipeline(StableDiffusionXLPipeline):
+ class TextToVideoZeroSDXLPipeline(
+     DiffusionPipeline,
+     StableDiffusionXLLoraLoaderMixin,
+     TextualInversionLoaderMixin,
+ ):
      r"""
      Pipeline for zero-shot text-to-video generation using Stable Diffusion XL.

@@ -332,6 +361,16 @@ class TextToVideoZeroSDXLPipeline(StableDiffusionXLPipeline):
              [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
      """

+     model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+     _optional_components = [
+         "tokenizer",
+         "tokenizer_2",
+         "text_encoder",
+         "text_encoder_2",
+         "image_encoder",
+         "feature_extractor",
+     ]
+
      def __init__(
          self,
          vae: AutoencoderKL,
@@ -346,7 +385,8 @@ class TextToVideoZeroSDXLPipeline(StableDiffusionXLPipeline):
          force_zeros_for_empty_prompt: bool = True,
          add_watermarker: Optional[bool] = None,
      ):
-         super().__init__(
+         super().__init__()
+         self.register_modules(
              vae=vae,
              text_encoder=text_encoder,
              text_encoder_2=text_encoder_2,
@@ -356,16 +396,435 @@ class TextToVideoZeroSDXLPipeline(StableDiffusionXLPipeline):
              scheduler=scheduler,
              image_encoder=image_encoder,
              feature_extractor=feature_extractor,
-             force_zeros_for_empty_prompt=force_zeros_for_empty_prompt,
-             add_watermarker=add_watermarker,
          )
+         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+         self.default_sample_size = self.unet.config.sample_size
+
+         add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+         if add_watermarker:
+             self.watermark = StableDiffusionXLWatermarker()
+         else:
+             self.watermark = None
+
          processor = (
              CrossFrameAttnProcessor2_0(batch_size=2)
              if hasattr(F, "scaled_dot_product_attention")
              else CrossFrameAttnProcessor(batch_size=2)
          )
+
          self.unet.set_attn_processor(processor)

+     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+     def prepare_extra_step_kwargs(self, generator, eta):
+         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+         # and should be between [0, 1]
+
+         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+         extra_step_kwargs = {}
+         if accepts_eta:
+             extra_step_kwargs["eta"] = eta
+
+         # check if the scheduler accepts generator
+         accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+         if accepts_generator:
+             extra_step_kwargs["generator"] = generator
+         return extra_step_kwargs
+
+     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+     def enable_vae_slicing(self):
+         r"""
+         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+         """
+         self.vae.enable_slicing()
+
+     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+     def disable_vae_slicing(self):
+         r"""
+         Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+         computing decoding in one step.
+         """
+         self.vae.disable_slicing()
+
+     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
+     def upcast_vae(self):
+         dtype = self.vae.dtype
+         self.vae.to(dtype=torch.float32)
+         use_torch_2_0_or_xformers = isinstance(
+             self.vae.decoder.mid_block.attentions[0].processor,
+             (
+                 AttnProcessor2_0,
+                 XFormersAttnProcessor,
+                 LoRAXFormersAttnProcessor,
+                 LoRAAttnProcessor2_0,
+                 FusedAttnProcessor2_0,
+             ),
+         )
+         # if xformers or torch_2_0 is used attention block does not need
+         # to be in float32 which can save lots of memory
+         if use_torch_2_0_or_xformers:
+             self.vae.post_quant_conv.to(dtype)
+             self.vae.decoder.conv_in.to(dtype)
+             self.vae.decoder.mid_block.to(dtype)
+
+     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+     def _get_add_time_ids(
+         self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+     ):
+         add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+         passed_add_embed_dim = (
+             self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+         )
+         expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+         if expected_add_embed_dim != passed_add_embed_dim:
+             raise ValueError(
+                 f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+             )
+
+         add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+         return add_time_ids
+
+     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+         shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+         if isinstance(generator, list) and len(generator) != batch_size:
+             raise ValueError(
+                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+             )
+
+         if latents is None:
+             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+         else:
+             latents = latents.to(device)
+
+         # scale the initial noise by the standard deviation required by the scheduler
+         latents = latents * self.scheduler.init_noise_sigma
+         return latents
+
+     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs
+     def check_inputs(
+         self,
+         prompt,
+         prompt_2,
+         height,
+         width,
+         callback_steps,
+         negative_prompt=None,
+         negative_prompt_2=None,
+         prompt_embeds=None,
+         negative_prompt_embeds=None,
+         pooled_prompt_embeds=None,
+         negative_pooled_prompt_embeds=None,
+         callback_on_step_end_tensor_inputs=None,
+     ):
+         if height % 8 != 0 or width % 8 != 0:
+             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+         if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+             raise ValueError(
+                 f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                 f" {type(callback_steps)}."
+             )
+
+         if callback_on_step_end_tensor_inputs is not None and not all(
+             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+         ):
+             raise ValueError(
+                 f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+             )
+
+         if prompt is not None and prompt_embeds is not None:
+             raise ValueError(
+                 f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                 " only forward one of the two."
+             )
+         elif prompt_2 is not None and prompt_embeds is not None:
+             raise ValueError(
+                 f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                 " only forward one of the two."
+             )
+         elif prompt is None and prompt_embeds is None:
+             raise ValueError(
+                 "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+             )
+         elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+         elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+             raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+         if negative_prompt is not None and negative_prompt_embeds is not None:
+             raise ValueError(
+                 f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+             )
+         elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+             raise ValueError(
+                 f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+             )
+
+         if prompt_embeds is not None and negative_prompt_embeds is not None:
+             if prompt_embeds.shape != negative_prompt_embeds.shape:
+                 raise ValueError(
+                     "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                     f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                     f" {negative_prompt_embeds.shape}."
+                 )
+
+         if prompt_embeds is not None and pooled_prompt_embeds is None:
+             raise ValueError(
+                 "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+             )
+
+         if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+             raise ValueError(
+                 "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+             )
+
+     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+     def encode_prompt(
+         self,
+         prompt: str,
+         prompt_2: Optional[str] = None,
+         device: Optional[torch.device] = None,
+         num_images_per_prompt: int = 1,
+         do_classifier_free_guidance: bool = True,
+         negative_prompt: Optional[str] = None,
+         negative_prompt_2: Optional[str] = None,
+         prompt_embeds: Optional[torch.FloatTensor] = None,
+         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+         lora_scale: Optional[float] = None,
+         clip_skip: Optional[int] = None,
+     ):
+         r"""
+         Encodes the prompt into text encoder hidden states.
+
+         Args:
+             prompt (`str` or `List[str]`, *optional*):
+                 prompt to be encoded
+             prompt_2 (`str` or `List[str]`, *optional*):
+                 The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                 used in both text-encoders
+             device: (`torch.device`):
+                 torch device
+             num_images_per_prompt (`int`):
+                 number of images that should be generated per prompt
+             do_classifier_free_guidance (`bool`):
+                 whether to use classifier free guidance or not
+             negative_prompt (`str` or `List[str]`, *optional*):
+                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                 less than `1`).
+             negative_prompt_2 (`str` or `List[str]`, *optional*):
+                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+             prompt_embeds (`torch.FloatTensor`, *optional*):
+                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                 provided, text embeddings will be generated from `prompt` input argument.
+             negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                 argument.
+             pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
+             negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                 input argument.
+             lora_scale (`float`, *optional*):
+                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+             clip_skip (`int`, *optional*):
+                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                 the output of the pre-final layer will be used for computing the prompt embeddings.
+         """
+         device = device or self._execution_device
+
+         # set lora scale so that monkey patched LoRA
+         # function of text encoder can correctly access it
+         if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+             self._lora_scale = lora_scale
+
+             # dynamically adjust the LoRA scale
+             if self.text_encoder is not None:
+                 if not USE_PEFT_BACKEND:
+                     adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                 else:
+                     scale_lora_layers(self.text_encoder, lora_scale)
+
+             if self.text_encoder_2 is not None:
+                 if not USE_PEFT_BACKEND:
+                     adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                 else:
+                     scale_lora_layers(self.text_encoder_2, lora_scale)
+
+         prompt = [prompt] if isinstance(prompt, str) else prompt
+
+         if prompt is not None:
+             batch_size = len(prompt)
+         else:
+             batch_size = prompt_embeds.shape[0]
+
+         # Define tokenizers and text encoders
+         tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+         text_encoders = (
+             [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+         )
+
+         if prompt_embeds is None:
+             prompt_2 = prompt_2 or prompt
+             prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+             # textual inversion: procecss multi-vector tokens if necessary
+             prompt_embeds_list = []
+             prompts = [prompt, prompt_2]
+             for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                 if isinstance(self, TextualInversionLoaderMixin):
+                     prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                 text_inputs = tokenizer(
+                     prompt,
+                     padding="max_length",
+                     max_length=tokenizer.model_max_length,
+                     truncation=True,
+                     return_tensors="pt",
+                 )
+
+                 text_input_ids = text_inputs.input_ids
+                 untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                 if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                     text_input_ids, untruncated_ids
+                 ):
+                     removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                     logger.warning(
+                         "The following part of your input was truncated because CLIP can only handle sequences up to"
+                         f" {tokenizer.model_max_length} tokens: {removed_text}"
+                     )
+
+                 prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                 # We are only ALWAYS interested in the pooled output of the final text encoder
+                 pooled_prompt_embeds = prompt_embeds[0]
+                 if clip_skip is None:
+                     prompt_embeds = prompt_embeds.hidden_states[-2]
+                 else:
+                     # "2" because SDXL always indexes from the penultimate layer.
+                     prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                 prompt_embeds_list.append(prompt_embeds)
+
+             prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+         # get unconditional embeddings for classifier free guidance
+         zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+         if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+             negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+             negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+         elif do_classifier_free_guidance and negative_prompt_embeds is None:
+             negative_prompt = negative_prompt or ""
+             negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+             # normalize str to list
+             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+             negative_prompt_2 = (
+                 batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+             )
+
+             uncond_tokens: List[str]
+             if prompt is not None and type(prompt) is not type(negative_prompt):
+                 raise TypeError(
+                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                     f" {type(prompt)}."
+                 )
+             elif batch_size != len(negative_prompt):
+                 raise ValueError(
+                     f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                     f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                     " the batch size of `prompt`."
+                 )
+             else:
+                 uncond_tokens = [negative_prompt, negative_prompt_2]
+
+             negative_prompt_embeds_list = []
+             for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                 if isinstance(self, TextualInversionLoaderMixin):
+                     negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                 max_length = prompt_embeds.shape[1]
+                 uncond_input = tokenizer(
+                     negative_prompt,
+                     padding="max_length",
+                     max_length=max_length,
+                     truncation=True,
+                     return_tensors="pt",
+                 )
+
+                 negative_prompt_embeds = text_encoder(
+                     uncond_input.input_ids.to(device),
+                     output_hidden_states=True,
+                 )
+                 # We are only ALWAYS interested in the pooled output of the final text encoder
+                 negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                 negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+             negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+         if self.text_encoder_2 is not None:
+             prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+         else:
+             prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+         bs_embed, seq_len, _ = prompt_embeds.shape
+         # duplicate text embeddings for each generation per prompt, using mps friendly method
+         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+         prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+         if do_classifier_free_guidance:
+             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+             seq_len = negative_prompt_embeds.shape[1]
+
+             if self.text_encoder_2 is not None:
+                 negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+             else:
+                 negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+         pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+             bs_embed * num_images_per_prompt, -1
+         )
+         if do_classifier_free_guidance:
+             negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                 bs_embed * num_images_per_prompt, -1
+             )
+
+         if self.text_encoder is not None:
+             if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder, lora_scale)
+
+         if self.text_encoder_2 is not None:
+             if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                 # Retrieve the original scale by scaling back the LoRA layers
+                 unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+         return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
      # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoZeroPipeline.forward_loop
      def forward_loop(self, x_t0, t0, t1, generator):
          """
diffusers/pipelines/unclip/pipeline_unclip.py
@@ -477,8 +477,9 @@ class UnCLIPPipeline(DiffusionPipeline):
          image = super_res_latents
          # done super res

-         # post processing
+         self.maybe_free_model_hooks()

+         # post processing
          image = image * 0.5 + 0.5
          image = image.clamp(0, 1)
          image = image.cpu().permute(0, 2, 3, 1).float().numpy()
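
This hunk, and the matching one for UnCLIPImageVariationPipeline right below, adds a self.maybe_free_model_hooks() call between the super-resolution stage and post-processing; when enable_model_cpu_offload() is active this returns the sub-models to the CPU so repeated calls start from a clean offload state. A hedged usage sketch (the Karlo checkpoint id is assumed, not part of this diff):

import torch
from diffusers import UnCLIPPipeline

pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()  # maybe_free_model_hooks() re-offloads the models after each call
image = pipe("a photo of a red apple").images[0]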
diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
@@ -403,6 +403,7 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline):
          image = super_res_latents

          # done super res
+         self.maybe_free_model_hooks()

          # post processing

diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py
@@ -19,8 +19,8 @@ import torch
  import torch.nn as nn

  from ...configuration_utils import ConfigMixin, register_to_config
+ from ...models.autoencoders.vae import DecoderOutput, VectorQuantizer
  from ...models.modeling_utils import ModelMixin
- from ...models.vae import DecoderOutput, VectorQuantizer
  from ...models.vq_model import VQEncoderOutput
  from ...utils.accelerate_utils import apply_forward_hook

diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py
@@ -69,6 +69,10 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin):
      This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
      library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

+     The pipeline also inherits the following loading methods:
+         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+
      Args:
          prior ([`Prior`]):
              The canonical unCLIP prior to approximate the image embedding from the text embedding.
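
The docstring addition above documents what the class signature already provides: WuerstchenPriorPipeline mixes in LoraLoaderMixin, so prior LoRA weights can be loaded through the mixin API. A sketch under that assumption (the LoRA repository id and weight file name are placeholders, not real artifacts):

from diffusers import WuerstchenPriorPipeline

prior = WuerstchenPriorPipeline.from_pretrained("warp-ai/wuerstchen-prior")
# hypothetical LoRA checkpoint trained for the Wuerstchen prior
prior.load_lora_weights("your-user/wuerstchen-prior-lora", weight_name="pytorch_lora_weights.safetensors")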
diffusers/schedulers/__init__.py
@@ -39,6 +39,7 @@ except OptionalDependencyNotAvailable:

  else:
      _import_structure["deprecated"] = ["KarrasVeScheduler", "ScoreSdeVpScheduler"]
+     _import_structure["scheduling_amused"] = ["AmusedScheduler"]
      _import_structure["scheduling_consistency_decoder"] = ["ConsistencyDecoderScheduler"]
      _import_structure["scheduling_consistency_models"] = ["CMStochasticIterativeScheduler"]
      _import_structure["scheduling_ddim"] = ["DDIMScheduler"]
@@ -129,6 +130,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
          from ..utils.dummy_pt_objects import *  # noqa F403
      else:
          from .deprecated import KarrasVeScheduler, ScoreSdeVpScheduler
+         from .scheduling_amused import AmusedScheduler
          from .scheduling_consistency_decoder import ConsistencyDecoderScheduler
          from .scheduling_consistency_models import CMStochasticIterativeScheduler
          from .scheduling_ddim import DDIMScheduler
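
These two hunks register the new AmusedScheduler in both the lazy _import_structure mapping and the eager import branch, matching the aMUSEd additions in the file list (pipeline_amused*.py, scheduling_amused.py, uvit_2d.py). A minimal end-to-end sketch, assuming the published aMUSEd checkpoints and their fp16 variant:

import torch
from diffusers import AmusedPipeline

pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16).to("cuda")
image = pipe("a cowboy riding a horse, oil painting", num_inference_steps=12).images[0]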