diffusers 0.28.2__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. diffusers/__init__.py +15 -1
  2. diffusers/commands/env.py +1 -5
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +2 -1
  5. diffusers/loaders/__init__.py +2 -2
  6. diffusers/loaders/lora.py +406 -140
  7. diffusers/loaders/lora_conversion_utils.py +7 -1
  8. diffusers/loaders/single_file.py +13 -1
  9. diffusers/loaders/single_file_model.py +15 -8
  10. diffusers/loaders/single_file_utils.py +267 -17
  11. diffusers/loaders/unet.py +307 -272
  12. diffusers/models/__init__.py +7 -3
  13. diffusers/models/attention.py +125 -1
  14. diffusers/models/attention_processor.py +169 -1
  15. diffusers/models/autoencoders/__init__.py +1 -0
  16. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  17. diffusers/models/autoencoders/autoencoder_kl.py +17 -6
  18. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
  19. diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
  20. diffusers/models/autoencoders/vq_model.py +182 -0
  21. diffusers/models/controlnet_sd3.py +418 -0
  22. diffusers/models/controlnet_xs.py +6 -6
  23. diffusers/models/embeddings.py +112 -84
  24. diffusers/models/model_loading_utils.py +55 -0
  25. diffusers/models/modeling_utils.py +138 -20
  26. diffusers/models/normalization.py +11 -6
  27. diffusers/models/transformers/__init__.py +1 -0
  28. diffusers/models/transformers/dual_transformer_2d.py +5 -4
  29. diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
  30. diffusers/models/transformers/prior_transformer.py +5 -5
  31. diffusers/models/transformers/transformer_2d.py +2 -2
  32. diffusers/models/transformers/transformer_sd3.py +353 -0
  33. diffusers/models/transformers/transformer_temporal.py +12 -10
  34. diffusers/models/unets/unet_1d.py +3 -3
  35. diffusers/models/unets/unet_2d.py +3 -3
  36. diffusers/models/unets/unet_2d_condition.py +4 -15
  37. diffusers/models/unets/unet_3d_condition.py +5 -17
  38. diffusers/models/unets/unet_i2vgen_xl.py +4 -4
  39. diffusers/models/unets/unet_motion_model.py +4 -4
  40. diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
  41. diffusers/models/vq_model.py +8 -165
  42. diffusers/pipelines/__init__.py +11 -0
  43. diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
  44. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
  45. diffusers/pipelines/auto_pipeline.py +8 -0
  46. diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
  47. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
  48. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
  49. diffusers/pipelines/controlnet_sd3/__init__.py +53 -0
  50. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1062 -0
  51. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
  52. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  53. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
  54. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
  55. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
  56. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
  57. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
  58. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
  59. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
  60. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
  61. diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
  62. diffusers/pipelines/pia/pipeline_pia.py +4 -3
  63. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  64. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  65. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
  69. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
  70. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
  71. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
  72. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
  73. diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
  74. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  75. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +904 -0
  76. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +941 -0
  77. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
  78. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
  79. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
  80. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
  81. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
  82. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
  83. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
  84. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
  85. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
  86. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
  87. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
  88. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
  89. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  90. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
  91. diffusers/schedulers/__init__.py +2 -0
  92. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  93. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
  94. diffusers/schedulers/scheduling_edm_euler.py +2 -4
  95. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
  96. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  97. diffusers/training_utils.py +4 -4
  98. diffusers/utils/__init__.py +3 -0
  99. diffusers/utils/constants.py +2 -0
  100. diffusers/utils/dummy_pt_objects.py +60 -0
  101. diffusers/utils/dummy_torch_and_transformers_objects.py +45 -0
  102. diffusers/utils/dynamic_modules_utils.py +15 -13
  103. diffusers/utils/hub_utils.py +106 -0
  104. diffusers/utils/import_utils.py +0 -1
  105. diffusers/utils/logging.py +3 -1
  106. diffusers/utils/state_dict_utils.py +2 -0
  107. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/METADATA +3 -3
  108. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/RECORD +112 -112
  109. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/WHEEL +1 -1
  110. diffusers/models/dual_transformer_2d.py +0 -20
  111. diffusers/models/prior_transformer.py +0 -12
  112. diffusers/models/t5_film_transformer.py +0 -70
  113. diffusers/models/transformer_2d.py +0 -25
  114. diffusers/models/transformer_temporal.py +0 -34
  115. diffusers/models/unet_1d.py +0 -26
  116. diffusers/models/unet_1d_blocks.py +0 -203
  117. diffusers/models/unet_2d.py +0 -27
  118. diffusers/models/unet_2d_blocks.py +0 -375
  119. diffusers/models/unet_2d_condition.py +0 -25
  120. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/LICENSE +0 -0
  121. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/entry_points.txt +0 -0
  122. {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/top_level.txt +0 -0
@@ -206,11 +206,11 @@ class UNet1DModel(ModelMixin, ConfigMixin):
  The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`.
  timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
  return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple.
+ Whether or not to return a [`~models.unets.unet_1d.UNet1DOutput`] instead of a plain tuple.

  Returns:
- [`~models.unet_1d.UNet1DOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
+ [`~models.unets.unet_1d.UNet1DOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
  returned where the first element is the sample tensor.
  """

@@ -257,11 +257,11 @@ class UNet2DModel(ModelMixin, ConfigMixin):
  class_labels (`torch.Tensor`, *optional*, defaults to `None`):
  Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
  return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
+ Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.

  Returns:
- [`~models.unet_2d.UNet2DOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
+ [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
  returned where the first element is the sample tensor.
  """
  # 0. center input if necessary
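These docstring updates only repoint the output classes to their relocated `models.unets` modules; the `return_dict` contract they describe is unchanged. A minimal sketch of that contract, assuming any standard `UNet2DModel` checkpoint (the Hub ID below is only illustrative):

```python
import torch
from diffusers import UNet2DModel

# Illustrative checkpoint; any UNet2DModel behaves the same way.
unet = UNet2DModel.from_pretrained("google/ddpm-cat-256")

sample = torch.randn(1, unet.config.in_channels, unet.config.sample_size, unet.config.sample_size)

# Default return_dict=True: a UNet2DOutput dataclass with a `.sample` field.
out = unet(sample, timestep=10)
print(type(out).__name__, out.sample.shape)

# return_dict=False: a plain tuple whose first element is the sample tensor.
(sample_only,) = unet(sample, timestep=10, return_dict=False)
print(sample_only.shape)
```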
@@ -110,13 +110,13 @@ class UNet2DConditionModel(
  The dimension of the cross attention features.
  transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
  The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
  reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
  The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
  blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
  encoder_hid_dim (`int`, *optional*, defaults to None):
  If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
  dimension to `cross_attention_dim`.
@@ -903,17 +903,6 @@ class UNet2DConditionModel(
  if self.original_attn_processors is not None:
  self.set_attn_processor(self.original_attn_processors)

- def unload_lora(self):
- """Unloads LoRA weights."""
- deprecate(
- "unload_lora",
- "0.28.0",
- "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().",
- )
- for module in self.modules():
- if hasattr(module, "set_lora_layer"):
- module.set_lora_layer(None)
-
  def get_time_embed(
  self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int]
  ) -> Optional[torch.Tensor]:
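The `unload_lora()` helper, deprecated with a 0.28.0 removal target, is now removed from the UNet classes here and in the 3D variant below. Its own deprecation message already pointed at the replacement; a minimal sketch of the peft-backed alternatives, assuming `peft` is installed and with an illustrative LoRA path:

```python
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.load_lora_weights("path/to/lora")  # hypothetical LoRA checkpoint

# Previously: pipe.unet.unload_lora()  (removed in this release)

# With the peft backend, adapters can be switched off on the model...
pipe.unet.disable_adapters()

# ...or stripped from the whole pipeline.
pipe.unload_lora_weights()
```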
@@ -22,7 +22,7 @@ import torch.utils.checkpoint

  from ...configuration_utils import ConfigMixin, register_to_config
  from ...loaders import UNet2DConditionLoadersMixin
- from ...utils import BaseOutput, deprecate, logging
+ from ...utils import BaseOutput, logging
  from ..activations import get_activation
  from ..attention_processor import (
  ADDED_KV_ATTENTION_PROCESSORS,
@@ -546,18 +546,6 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
  if self.original_attn_processors is not None:
  self.set_attn_processor(self.original_attn_processors)

- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unload_lora
- def unload_lora(self):
- """Unloads LoRA weights."""
- deprecate(
- "unload_lora",
- "0.28.0",
- "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().",
- )
- for module in self.modules():
- if hasattr(module, "set_lora_layer"):
- module.set_lora_layer(None)
-
  def forward(
  self,
  sample: torch.Tensor,
@@ -598,15 +586,15 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
  mid_block_additional_residual: (`torch.Tensor`, *optional*):
  A tensor that if specified is added to the residual of the middle unet block.
  return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+ Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
  tuple.
  cross_attention_kwargs (`dict`, *optional*):
  A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].

  Returns:
- [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
- a `tuple` is returned where the first element is the sample tensor.
+ [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is the sample tensor.
  """
  # By default samples have to be AT least a multiple of the overall upsampling factor.
  # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
@@ -542,13 +542,13 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
  `self.processor` in
  [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
  return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+ Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
  tuple.

  Returns:
- [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
- a `tuple` is returned where the first element is the sample tensor.
+ [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is the sample tensor.
  """
  batch_size, channels, num_frames, height, width = sample.shape

@@ -856,13 +856,13 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
  mid_block_additional_residual: (`torch.Tensor`, *optional*):
  A tensor that if specified is added to the residual of the middle unet block.
  return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+ Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
  tuple.

  Returns:
- [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
- a `tuple` is returned where the first element is the sample tensor.
+ [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is the sample tensor.
  """
  # By default samples have to be AT least a multiple of the overall upsampling factor.
  # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
@@ -57,9 +57,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
  The dimension of the cross attention features.
  transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
  The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
- [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
- [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
- [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
+ [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
+ [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
+ [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
  num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
  The number of attention heads.
  dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
@@ -11,172 +11,15 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from dataclasses import dataclass
- from typing import Optional, Tuple, Union
+ from ..utils import deprecate
+ from .autoencoders.vq_model import VQEncoderOutput, VQModel

- import torch
- import torch.nn as nn

- from ..configuration_utils import ConfigMixin, register_to_config
- from ..utils import BaseOutput
- from ..utils.accelerate_utils import apply_forward_hook
- from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
- from .modeling_utils import ModelMixin
+ class VQEncoderOutput(VQEncoderOutput):
+ deprecation_message = "Importing `VQEncoderOutput` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQEncoderOutput`, instead."
+ deprecate("VQEncoderOutput", "0.31", deprecation_message)


- @dataclass
- class VQEncoderOutput(BaseOutput):
- """
- Output of VQModel encoding method.
-
- Args:
- latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
- The encoded output sample from the last layer of the model.
- """
-
- latents: torch.Tensor
-
-
- class VQModel(ModelMixin, ConfigMixin):
- r"""
- A VQ-VAE model for decoding latent representations.
-
- This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
- for all models (such as downloading or saving).
-
- Parameters:
- in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
- out_channels (int, *optional*, defaults to 3): Number of channels in the output.
- down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
- Tuple of downsample block types.
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
- Tuple of upsample block types.
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
- Tuple of block output channels.
- layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
- latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
- sample_size (`int`, *optional*, defaults to `32`): Sample input size.
- num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
- norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
- vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
- scaling_factor (`float`, *optional*, defaults to `0.18215`):
- The component-wise standard deviation of the trained latent space computed using the first batch of the
- training set. This is used to scale the latent space to have unit variance when training the diffusion
- model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
- diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
- / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
- Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
- norm_type (`str`, *optional*, defaults to `"group"`):
- Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
- """
-
- @register_to_config
- def __init__(
- self,
- in_channels: int = 3,
- out_channels: int = 3,
- down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
- up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
- block_out_channels: Tuple[int, ...] = (64,),
- layers_per_block: int = 1,
- act_fn: str = "silu",
- latent_channels: int = 3,
- sample_size: int = 32,
- num_vq_embeddings: int = 256,
- norm_num_groups: int = 32,
- vq_embed_dim: Optional[int] = None,
- scaling_factor: float = 0.18215,
- norm_type: str = "group", # group, spatial
- mid_block_add_attention=True,
- lookup_from_codebook=False,
- force_upcast=False,
- ):
- super().__init__()
-
- # pass init params to Encoder
- self.encoder = Encoder(
- in_channels=in_channels,
- out_channels=latent_channels,
- down_block_types=down_block_types,
- block_out_channels=block_out_channels,
- layers_per_block=layers_per_block,
- act_fn=act_fn,
- norm_num_groups=norm_num_groups,
- double_z=False,
- mid_block_add_attention=mid_block_add_attention,
- )
-
- vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
-
- self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
- self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
- self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
-
- # pass init params to Decoder
- self.decoder = Decoder(
- in_channels=latent_channels,
- out_channels=out_channels,
- up_block_types=up_block_types,
- block_out_channels=block_out_channels,
- layers_per_block=layers_per_block,
- act_fn=act_fn,
- norm_num_groups=norm_num_groups,
- norm_type=norm_type,
- mid_block_add_attention=mid_block_add_attention,
- )
-
- @apply_forward_hook
- def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
- h = self.encoder(x)
- h = self.quant_conv(h)
-
- if not return_dict:
- return (h,)
-
- return VQEncoderOutput(latents=h)
-
- @apply_forward_hook
- def decode(
- self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
- ) -> Union[DecoderOutput, torch.Tensor]:
- # also go through quantization layer
- if not force_not_quantize:
- quant, commit_loss, _ = self.quantize(h)
- elif self.config.lookup_from_codebook:
- quant = self.quantize.get_codebook_entry(h, shape)
- commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
- else:
- quant = h
- commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
- quant2 = self.post_quant_conv(quant)
- dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
-
- if not return_dict:
- return dec, commit_loss
-
- return DecoderOutput(sample=dec, commit_loss=commit_loss)
-
- def forward(
- self, sample: torch.Tensor, return_dict: bool = True
- ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
- r"""
- The [`VQModel`] forward method.
-
- Args:
- sample (`torch.Tensor`): Input sample.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple.
-
- Returns:
- [`~models.vq_model.VQEncoderOutput`] or `tuple`:
- If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
- is returned.
- """
-
- h = self.encode(sample).latents
- dec = self.decode(h)
-
- if not return_dict:
- return dec.sample, dec.commit_loss
- return dec
+ class VQModel(VQModel):
+ deprecation_message = "Importing `VQModel` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQModel`, instead."
+ deprecate("VQModel", "0.31", deprecation_message)
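After this change, `diffusers.models.vq_model` is just the thin shim above: two subclasses that re-export the relocated implementations and emit a deprecation warning when the old module is imported. A minimal sketch of the old and new import paths (the class behavior itself is unchanged):

```python
# Deprecated path: still works in 0.29.x but warns; removal is scheduled for 0.31.
from diffusers.models.vq_model import VQEncoderOutput, VQModel  # noqa: F401

# Preferred path going forward.
from diffusers.models.autoencoders.vq_model import VQEncoderOutput, VQModel  # noqa: F401, F811

# The top-level export is unaffected by the move.
from diffusers import VQModel  # noqa: F401, F811
```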
@@ -20,6 +20,7 @@ from ..utils import (
  _dummy_objects = {}
  _import_structure = {
  "controlnet": [],
+ "controlnet_sd3": [],
  "controlnet_xs": [],
  "deprecated": [],
  "latent_diffusion": [],
@@ -142,6 +143,11 @@ else:
  "StableDiffusionXLControlNetXSPipeline",
  ]
  )
+ _import_structure["controlnet_sd3"].extend(
+ [
+ "StableDiffusion3ControlNetPipeline",
+ ]
+ )
  _import_structure["deepfloyd_if"] = [
  "IFImg2ImgPipeline",
  "IFImg2ImgSuperResolutionPipeline",
@@ -220,6 +226,7 @@ else:
  "StableDiffusionLDM3DPipeline",
  ]
  )
+ _import_structure["stable_diffusion_3"] = ["StableDiffusion3Pipeline", "StableDiffusion3Img2ImgPipeline"]
  _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
  _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
  _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"]
@@ -393,6 +400,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  StableDiffusionXLControlNetInpaintPipeline,
  StableDiffusionXLControlNetPipeline,
  )
+ from .controlnet_sd3 import (
+ StableDiffusion3ControlNetPipeline,
+ )
  from .controlnet_xs import (
  StableDiffusionControlNetXSPipeline,
  StableDiffusionXLControlNetXSPipeline,
@@ -485,6 +495,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  StableUnCLIPImg2ImgPipeline,
  StableUnCLIPPipeline,
  )
+ from .stable_diffusion_3 import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
  from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
  from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
  from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
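The new `stable_diffusion_3` entries register `StableDiffusion3Pipeline` and `StableDiffusion3Img2ImgPipeline` with the lazy-import machinery, so both become importable from the package root. A hedged usage sketch; the checkpoint ID is illustrative and the SD3 weights are gated on the Hub:

```python
import torch
from diffusers import StableDiffusion3Pipeline

# Illustrative, gated checkpoint; any SD3-format repository should load the same way.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    prompt="a macro photo of a dewdrop on a leaf",
    num_inference_steps=28,
    guidance_scale=7.0,
).images[0]
image.save("sd3.png")
```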
@@ -316,9 +316,10 @@ class AnimateDiffPipeline(
  negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
- # Retrieve the original scale by scaling back the LoRA layers
- unscale_lora_layers(self.text_encoder, lora_scale)
+ if self.text_encoder is not None:
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)

  return prompt_embeds, negative_prompt_embeds

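The added `self.text_encoder is not None` guard matters when a pipeline is driven from precomputed prompt embeddings with its text encoder removed; previously `encode_prompt` would still try to unscale LoRA layers on a `None` module. A rough sketch of that scenario, with illustrative checkpoint IDs:

```python
from diffusers import AnimateDiffPipeline, MotionAdapter

# Illustrative checkpoints.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)

# Precompute embeddings while the text encoder is still loaded...
prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    "a rocket lifting off",
    device="cpu",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="low quality",
)

# ...then drop the encoder (e.g. to free memory) and call with embeddings only.
# Before this release, the trailing LoRA-unscale step assumed text_encoder was always set.
pipe.text_encoder = None
frames = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    num_frames=8,
    num_inference_steps=2,
).frames[0]
```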
@@ -420,9 +420,10 @@ class AnimateDiffVideoToVideoPipeline(
  negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
- # Retrieve the original scale by scaling back the LoRA layers
- unscale_lora_layers(self.text_encoder, lora_scale)
+ if self.text_encoder is not None:
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)

  return prompt_embeds, negative_prompt_embeds

@@ -27,6 +27,7 @@ from .controlnet import (
  StableDiffusionXLControlNetPipeline,
  )
  from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
+ from .hunyuandit import HunyuanDiTPipeline
  from .kandinsky import (
  KandinskyCombinedPipeline,
  KandinskyImg2ImgCombinedPipeline,
@@ -52,6 +53,10 @@ from .stable_diffusion import (
  StableDiffusionInpaintPipeline,
  StableDiffusionPipeline,
  )
+ from .stable_diffusion_3 import (
+ StableDiffusion3Img2ImgPipeline,
+ StableDiffusion3Pipeline,
+ )
  from .stable_diffusion_xl import (
  StableDiffusionXLImg2ImgPipeline,
  StableDiffusionXLInpaintPipeline,
@@ -64,7 +69,9 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
  [
  ("stable-diffusion", StableDiffusionPipeline),
  ("stable-diffusion-xl", StableDiffusionXLPipeline),
+ ("stable-diffusion-3", StableDiffusion3Pipeline),
  ("if", IFPipeline),
+ ("hunyuan", HunyuanDiTPipeline),
  ("kandinsky", KandinskyCombinedPipeline),
  ("kandinsky22", KandinskyV22CombinedPipeline),
  ("kandinsky3", Kandinsky3Pipeline),
@@ -82,6 +89,7 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
  [
  ("stable-diffusion", StableDiffusionImg2ImgPipeline),
  ("stable-diffusion-xl", StableDiffusionXLImg2ImgPipeline),
+ ("stable-diffusion-3", StableDiffusion3Img2ImgPipeline),
  ("if", IFImg2ImgPipeline),
  ("kandinsky", KandinskyImg2ImgCombinedPipeline),
  ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
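With these mapping entries, the auto pipelines can resolve SD3 checkpoints (and, for text-to-image, HunyuanDiT) to the right class from the model index alone. A brief sketch, with an illustrative checkpoint ID:

```python
import torch
from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image

# Resolves to StableDiffusion3Pipeline via the new "stable-diffusion-3" entry.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
)

# Reuses the loaded components and resolves to StableDiffusion3Img2ImgPipeline.
img2img = AutoPipelineForImage2Image.from_pipe(pipe)
```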
@@ -463,9 +463,10 @@ class StableDiffusionControlNetPipeline(
  negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
- # Retrieve the original scale by scaling back the LoRA layers
- unscale_lora_layers(self.text_encoder, lora_scale)
+ if self.text_encoder is not None:
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)

  return prompt_embeds, negative_prompt_embeds

@@ -441,9 +441,10 @@ class StableDiffusionControlNetImg2ImgPipeline(
  negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
- # Retrieve the original scale by scaling back the LoRA layers
- unscale_lora_layers(self.text_encoder, lora_scale)
+ if self.text_encoder is not None:
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)

  return prompt_embeds, negative_prompt_embeds

@@ -566,9 +566,10 @@ class StableDiffusionControlNetInpaintPipeline(
  negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
  negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

- if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
- # Retrieve the original scale by scaling back the LoRA layers
- unscale_lora_layers(self.text_encoder, lora_scale)
+ if self.text_encoder is not None:
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)

  return prompt_embeds, negative_prompt_embeds

@@ -0,0 +1,53 @@
+ from typing import TYPE_CHECKING
+
+ from ...utils import (
+ DIFFUSERS_SLOW_IMPORT,
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ get_objects_from_module,
+ is_flax_available,
+ is_torch_available,
+ is_transformers_available,
+ )
+
+
+ _dummy_objects = {}
+ _import_structure = {}
+
+ try:
+ if not (is_transformers_available() and is_torch_available()):
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ from ...utils import dummy_torch_and_transformers_objects # noqa F403
+
+ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+ else:
+ _import_structure["pipeline_stable_diffusion_3_controlnet"] = ["StableDiffusion3ControlNetPipeline"]
+
+ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+ try:
+ if not (is_transformers_available() and is_torch_available()):
+ raise OptionalDependencyNotAvailable()
+
+ except OptionalDependencyNotAvailable:
+ from ...utils.dummy_torch_and_transformers_objects import *
+ else:
+ from .pipeline_stable_diffusion_3_controlnet import StableDiffusion3ControlNetPipeline
+
+ try:
+ if not (is_transformers_available() and is_flax_available()):
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ from ...utils.dummy_flax_and_transformers_objects import * # noqa F403
+
+ else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()["__file__"],
+ _import_structure,
+ module_spec=__spec__,
+ )
+ for name, value in _dummy_objects.items():
+ setattr(sys.modules[__name__], name, value)
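The new subpackage follows the usual lazy-import layout and exposes a single pipeline, `StableDiffusion3ControlNetPipeline`. A hedged usage sketch: the `SD3ControlNetModel` class (presumably defined in the new `models/controlnet_sd3.py`), the call-argument names, and both checkpoint IDs are assumptions rather than something shown in this diff:

```python
import torch
from PIL import Image

from diffusers import SD3ControlNetModel, StableDiffusion3ControlNetPipeline

# Illustrative checkpoints; the SD3 base weights are gated on the Hub.
controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    controlnet=controlnet,
    torch_dtype=torch.float16,
).to("cuda")

# Stand-in for a real Canny edge map of the target composition.
control_image = Image.new("RGB", (1024, 1024))

image = pipe(
    prompt="a bird perched on a mossy branch",
    control_image=control_image,
    controlnet_conditioning_scale=0.7,
).images[0]
image.save("sd3_controlnet.png")
```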