diffusers 0.28.2__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +15 -1
- diffusers/commands/env.py +1 -5
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +2 -1
- diffusers/loaders/__init__.py +2 -2
- diffusers/loaders/lora.py +406 -140
- diffusers/loaders/lora_conversion_utils.py +7 -1
- diffusers/loaders/single_file.py +13 -1
- diffusers/loaders/single_file_model.py +15 -8
- diffusers/loaders/single_file_utils.py +267 -17
- diffusers/loaders/unet.py +307 -272
- diffusers/models/__init__.py +7 -3
- diffusers/models/attention.py +125 -1
- diffusers/models/attention_processor.py +169 -1
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
- diffusers/models/autoencoders/autoencoder_kl.py +17 -6
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
- diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
- diffusers/models/autoencoders/vq_model.py +182 -0
- diffusers/models/controlnet_sd3.py +418 -0
- diffusers/models/controlnet_xs.py +6 -6
- diffusers/models/embeddings.py +112 -84
- diffusers/models/model_loading_utils.py +55 -0
- diffusers/models/modeling_utils.py +138 -20
- diffusers/models/normalization.py +11 -6
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/dual_transformer_2d.py +5 -4
- diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
- diffusers/models/transformers/prior_transformer.py +5 -5
- diffusers/models/transformers/transformer_2d.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +353 -0
- diffusers/models/transformers/transformer_temporal.py +12 -10
- diffusers/models/unets/unet_1d.py +3 -3
- diffusers/models/unets/unet_2d.py +3 -3
- diffusers/models/unets/unet_2d_condition.py +4 -15
- diffusers/models/unets/unet_3d_condition.py +5 -17
- diffusers/models/unets/unet_i2vgen_xl.py +4 -4
- diffusers/models/unets/unet_motion_model.py +4 -4
- diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
- diffusers/models/vq_model.py +8 -165
- diffusers/pipelines/__init__.py +11 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
- diffusers/pipelines/auto_pipeline.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +53 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1062 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
- diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
- diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
- diffusers/pipelines/pia/pipeline_pia.py +4 -3
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +904 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +941 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
- diffusers/schedulers/__init__.py +2 -0
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
- diffusers/schedulers/scheduling_edm_euler.py +2 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/training_utils.py +4 -4
- diffusers/utils/__init__.py +3 -0
- diffusers/utils/constants.py +2 -0
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +45 -0
- diffusers/utils/dynamic_modules_utils.py +15 -13
- diffusers/utils/hub_utils.py +106 -0
- diffusers/utils/import_utils.py +0 -1
- diffusers/utils/logging.py +3 -1
- diffusers/utils/state_dict_utils.py +2 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/METADATA +3 -3
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/RECORD +112 -112
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/WHEEL +1 -1
- diffusers/models/dual_transformer_2d.py +0 -20
- diffusers/models/prior_transformer.py +0 -12
- diffusers/models/t5_film_transformer.py +0 -70
- diffusers/models/transformer_2d.py +0 -25
- diffusers/models/transformer_temporal.py +0 -34
- diffusers/models/unet_1d.py +0 -26
- diffusers/models/unet_1d_blocks.py +0 -203
- diffusers/models/unet_2d.py +0 -27
- diffusers/models/unet_2d_blocks.py +0 -375
- diffusers/models/unet_2d_condition.py +0 -25
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/LICENSE +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/top_level.txt +0 -0
diffusers/models/unets/unet_1d.py
CHANGED
@@ -206,11 +206,11 @@ class UNet1DModel(ModelMixin, ConfigMixin):
                 The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`.
             timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.unets.unet_1d.UNet1DOutput`] instead of a plain tuple.

         Returns:
-            [`~models.unet_1d.UNet1DOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
+            [`~models.unets.unet_1d.UNet1DOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
                 returned where the first element is the sample tensor.
         """
diffusers/models/unets/unet_2d.py
CHANGED
@@ -257,11 +257,11 @@ class UNet2DModel(ModelMixin, ConfigMixin):
             class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                 Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.

         Returns:
-            [`~models.unet_2d.UNet2DOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
+            [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
                 returned where the first element is the sample tensor.
         """
         # 0. center input if necessary
diffusers/models/unets/unet_2d_condition.py
CHANGED
@@ -110,13 +110,13 @@ class UNet2DConditionModel(
             The dimension of the cross attention features.
         transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
             The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+            [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
         reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
             The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
             blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+            [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
         encoder_hid_dim (`int`, *optional*, defaults to None):
             If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
             dimension to `cross_attention_dim`.
@@ -903,17 +903,6 @@ class UNet2DConditionModel(
         if self.original_attn_processors is not None:
             self.set_attn_processor(self.original_attn_processors)

-    def unload_lora(self):
-        """Unloads LoRA weights."""
-        deprecate(
-            "unload_lora",
-            "0.28.0",
-            "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().",
-        )
-        for module in self.modules():
-            if hasattr(module, "set_lora_layer"):
-                module.set_lora_layer(None)
-
     def get_time_embed(
         self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int]
     ) -> Optional[torch.Tensor]:
diffusers/models/unets/unet_3d_condition.py
CHANGED
@@ -22,7 +22,7 @@ import torch.utils.checkpoint

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import UNet2DConditionLoadersMixin
-from ...utils import BaseOutput, deprecate, logging
+from ...utils import BaseOutput, logging
 from ..activations import get_activation
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -546,18 +546,6 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         if self.original_attn_processors is not None:
             self.set_attn_processor(self.original_attn_processors)

-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unload_lora
-    def unload_lora(self):
-        """Unloads LoRA weights."""
-        deprecate(
-            "unload_lora",
-            "0.28.0",
-            "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().",
-        )
-        for module in self.modules():
-            if hasattr(module, "set_lora_layer"):
-                module.set_lora_layer(None)
-
     def forward(
         self,
         sample: torch.Tensor,
@@ -598,15 +586,15 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
             mid_block_additional_residual: (`torch.Tensor`, *optional*):
                 A tensor that if specified is added to the residual of the middle unet block.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                 tuple.
             cross_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].

         Returns:
-            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned,
-                a `tuple` is returned where the first element is the sample tensor.
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
         """
         # By default samples have to be AT least a multiple of the overall upsampling factor.
         # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
diffusers/models/unets/unet_i2vgen_xl.py
CHANGED
@@ -542,13 +542,13 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                 tuple.

         Returns:
-            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned,
-                a `tuple` is returned where the first element is the sample tensor.
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
         """
         batch_size, channels, num_frames, height, width = sample.shape
diffusers/models/unets/unet_motion_model.py
CHANGED
@@ -856,13 +856,13 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             mid_block_additional_residual: (`torch.Tensor`, *optional*):
                 A tensor that if specified is added to the residual of the middle unet block.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                 tuple.

         Returns:
-            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned,
-                a `tuple` is returned where the first element is the sample tensor.
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
         """
         # By default samples have to be AT least a multiple of the overall upsampling factor.
         # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
diffusers/models/unets/unet_spatio_temporal_condition.py
CHANGED
@@ -57,9 +57,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
             The dimension of the cross attention features.
         transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
             The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
-            [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
-            [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
+            [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
+            [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
+            [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
         num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
             The number of attention heads.
         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
diffusers/models/vq_model.py
CHANGED
@@ -11,172 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from ..utils import deprecate
+from .autoencoders.vq_model import VQEncoderOutput, VQModel

-import torch
-import torch.nn as nn

-from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput
-from ..utils.accelerate_utils import apply_forward_hook
-from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
-from .modeling_utils import ModelMixin
+class VQEncoderOutput(VQEncoderOutput):
+    deprecation_message = "Importing `VQEncoderOutput` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQEncoderOutput`, instead."
+    deprecate("VQEncoderOutput", "0.31", deprecation_message)


-@dataclass
-class VQEncoderOutput(BaseOutput):
-    """
-    Output of VQModel encoding method.
-
-    Args:
-        latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
-            The encoded output sample from the last layer of the model.
-    """
-
-    latents: torch.Tensor
-
-
-class VQModel(ModelMixin, ConfigMixin):
-    r"""
-    A VQ-VAE model for decoding latent representations.
-
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
-    for all models (such as downloading or saving).
-
-    Parameters:
-        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
-        out_channels (int, *optional*, defaults to 3): Number of channels in the output.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
-            Tuple of downsample block types.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
-            Tuple of upsample block types.
-        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
-            Tuple of block output channels.
-        layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
-        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-        latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
-        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
-        num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
-        norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
-        vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
-        scaling_factor (`float`, *optional*, defaults to `0.18215`):
-            The component-wise standard deviation of the trained latent space computed using the first batch of the
-            training set. This is used to scale the latent space to have unit variance when training the diffusion
-            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
-            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
-            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
-            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
-        norm_type (`str`, *optional*, defaults to `"group"`):
-            Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
-    """
-
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 3,
-        out_channels: int = 3,
-        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int, ...] = (64,),
-        layers_per_block: int = 1,
-        act_fn: str = "silu",
-        latent_channels: int = 3,
-        sample_size: int = 32,
-        num_vq_embeddings: int = 256,
-        norm_num_groups: int = 32,
-        vq_embed_dim: Optional[int] = None,
-        scaling_factor: float = 0.18215,
-        norm_type: str = "group",  # group, spatial
-        mid_block_add_attention=True,
-        lookup_from_codebook=False,
-        force_upcast=False,
-    ):
-        super().__init__()
-
-        # pass init params to Encoder
-        self.encoder = Encoder(
-            in_channels=in_channels,
-            out_channels=latent_channels,
-            down_block_types=down_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-            norm_num_groups=norm_num_groups,
-            double_z=False,
-            mid_block_add_attention=mid_block_add_attention,
-        )
-
-        vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
-
-        self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
-        self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
-        self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
-
-        # pass init params to Decoder
-        self.decoder = Decoder(
-            in_channels=latent_channels,
-            out_channels=out_channels,
-            up_block_types=up_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-            norm_num_groups=norm_num_groups,
-            norm_type=norm_type,
-            mid_block_add_attention=mid_block_add_attention,
-        )
-
-    @apply_forward_hook
-    def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-
-        if not return_dict:
-            return (h,)
-
-        return VQEncoderOutput(latents=h)
-
-    @apply_forward_hook
-    def decode(
-        self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
-    ) -> Union[DecoderOutput, torch.Tensor]:
-        # also go through quantization layer
-        if not force_not_quantize:
-            quant, commit_loss, _ = self.quantize(h)
-        elif self.config.lookup_from_codebook:
-            quant = self.quantize.get_codebook_entry(h, shape)
-            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-        else:
-            quant = h
-            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-        quant2 = self.post_quant_conv(quant)
-        dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
-
-        if not return_dict:
-            return dec, commit_loss
-
-        return DecoderOutput(sample=dec, commit_loss=commit_loss)
-
-    def forward(
-        self, sample: torch.Tensor, return_dict: bool = True
-    ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
-        r"""
-        The [`VQModel`] forward method.
-
-        Args:
-            sample (`torch.Tensor`): Input sample.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.vq_model.VQEncoderOutput`] or `tuple`:
-                If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
-                is returned.
-        """
-
-        h = self.encode(sample).latents
-        dec = self.decode(h)
-
-        if not return_dict:
-            return dec.sample, dec.commit_loss
-        return dec
+class VQModel(VQModel):
+    deprecation_message = "Importing `VQModel` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQModel`, instead."
+    deprecate("VQModel", "0.31", deprecation_message)
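`VQModel` and `VQEncoderOutput` now live in `diffusers.models.autoencoders.vq_model`; the old module keeps only deprecation shims scheduled for removal in 0.31. A short sketch of the two import paths:

```python
# Preferred location from 0.29 onwards (no warning).
from diffusers.models.autoencoders.vq_model import VQEncoderOutput, VQModel

# The old location still resolves in 0.29.x, but importing it goes through the
# `deprecate("VQModel", "0.31", ...)` shim shown above and warns accordingly.
from diffusers.models.vq_model import VQModel as LegacyVQModel
```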
diffusers/pipelines/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from ..utils import (
 _dummy_objects = {}
 _import_structure = {
     "controlnet": [],
+    "controlnet_sd3": [],
     "controlnet_xs": [],
     "deprecated": [],
     "latent_diffusion": [],
@@ -142,6 +143,11 @@ else:
             "StableDiffusionXLControlNetXSPipeline",
         ]
     )
+    _import_structure["controlnet_sd3"].extend(
+        [
+            "StableDiffusion3ControlNetPipeline",
+        ]
+    )
     _import_structure["deepfloyd_if"] = [
         "IFImg2ImgPipeline",
         "IFImg2ImgSuperResolutionPipeline",
@@ -220,6 +226,7 @@ else:
             "StableDiffusionLDM3DPipeline",
         ]
     )
+    _import_structure["stable_diffusion_3"] = ["StableDiffusion3Pipeline", "StableDiffusion3Img2ImgPipeline"]
     _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
     _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
     _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"]
@@ -393,6 +400,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             StableDiffusionXLControlNetInpaintPipeline,
             StableDiffusionXLControlNetPipeline,
         )
+        from .controlnet_sd3 import (
+            StableDiffusion3ControlNetPipeline,
+        )
         from .controlnet_xs import (
             StableDiffusionControlNetXSPipeline,
             StableDiffusionXLControlNetXSPipeline,
@@ -485,6 +495,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             StableUnCLIPImg2ImgPipeline,
             StableUnCLIPPipeline,
         )
+        from .stable_diffusion_3 import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
         from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
         from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
         from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
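With `stable_diffusion_3` wired into the lazy `_import_structure`, the new pipelines are importable from `diffusers.pipelines` (and, per the `diffusers/__init__.py` change listed at the top, from the package root). A usage sketch; the checkpoint id is a placeholder for any Stable Diffusion 3 checkpoint in diffusers format:

```python
import torch

from diffusers import StableDiffusion3Pipeline

# Placeholder checkpoint id; gated weights require accepting the model license.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

image = pipe("a photo of an astronaut riding a horse", num_inference_steps=28).images[0]
image.save("sd3.png")
```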
diffusers/pipelines/animatediff/pipeline_animatediff.py
CHANGED
@@ -316,9 +316,10 @@ class AnimateDiffPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
CHANGED
@@ -420,9 +420,10 @@ class AnimateDiffVideoToVideoPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/auto_pipeline.py
CHANGED
@@ -27,6 +27,7 @@ from .controlnet import (
     StableDiffusionXLControlNetPipeline,
 )
 from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
+from .hunyuandit import HunyuanDiTPipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
     KandinskyImg2ImgCombinedPipeline,
@@ -52,6 +53,10 @@ from .stable_diffusion import (
     StableDiffusionInpaintPipeline,
     StableDiffusionPipeline,
 )
+from .stable_diffusion_3 import (
+    StableDiffusion3Img2ImgPipeline,
+    StableDiffusion3Pipeline,
+)
 from .stable_diffusion_xl import (
     StableDiffusionXLImg2ImgPipeline,
     StableDiffusionXLInpaintPipeline,
@@ -64,7 +69,9 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
     [
         ("stable-diffusion", StableDiffusionPipeline),
         ("stable-diffusion-xl", StableDiffusionXLPipeline),
+        ("stable-diffusion-3", StableDiffusion3Pipeline),
         ("if", IFPipeline),
+        ("hunyuan", HunyuanDiTPipeline),
         ("kandinsky", KandinskyCombinedPipeline),
         ("kandinsky22", KandinskyV22CombinedPipeline),
         ("kandinsky3", Kandinsky3Pipeline),
@@ -82,6 +89,7 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
     [
         ("stable-diffusion", StableDiffusionImg2ImgPipeline),
         ("stable-diffusion-xl", StableDiffusionXLImg2ImgPipeline),
+        ("stable-diffusion-3", StableDiffusion3Img2ImgPipeline),
         ("if", IFImg2ImgPipeline),
         ("kandinsky", KandinskyImg2ImgCombinedPipeline),
         ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
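With Stable Diffusion 3 and HunyuanDiT added to these mappings, the `AutoPipeline*` classes can dispatch to the new pipeline classes when given a matching checkpoint. A sketch with a placeholder checkpoint id:

```python
import torch

from diffusers import AutoPipelineForText2Image

# Resolves to StableDiffusion3Pipeline through the new "stable-diffusion-3" entry;
# a HunyuanDiT checkpoint would resolve through the new "hunyuan" entry.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

image = pipe("a corgi wearing sunglasses").images[0]
```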
diffusers/pipelines/controlnet/pipeline_controlnet.py
CHANGED
@@ -463,9 +463,10 @@ class StableDiffusionControlNetPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
CHANGED
@@ -441,9 +441,10 @@ class StableDiffusionControlNetImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
CHANGED
@@ -566,9 +566,10 @@ class StableDiffusionControlNetInpaintPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)

         return prompt_embeds, negative_prompt_embeds
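The repeated change across these pipelines wraps the `unscale_lora_layers` cleanup in an `if self.text_encoder is not None:` guard, so `encode_prompt` no longer assumes a text encoder is present. For context, a hedged sketch of how the scale being unscaled gets there in the first place; checkpoint ids and the blank control image are placeholders:

```python
import torch
from PIL import Image

from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("path/to/lora")  # placeholder LoRA checkpoint

control_image = Image.new("RGB", (512, 512))  # placeholder conditioning image
image = pipe(
    "a castle at sunset",
    image=control_image,
    # Forwarded to encode_prompt as `lora_scale`; the text encoder's LoRA layers are
    # scaled by it and then reverted afterwards via unscale_lora_layers(...).
    cross_attention_kwargs={"scale": 0.7},
).images[0]
```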
diffusers/pipelines/controlnet_sd3/__init__.py
ADDED
@@ -0,0 +1,53 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_3_controlnet"] = ["StableDiffusion3ControlNetPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_stable_diffusion_3_controlnet import StableDiffusion3ControlNetPipeline
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_and_transformers_objects import *  # noqa F403
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)