diffusers 0.28.2__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +9 -1
- diffusers/commands/env.py +1 -5
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +2 -1
- diffusers/loaders/__init__.py +2 -2
- diffusers/loaders/lora.py +406 -140
- diffusers/loaders/lora_conversion_utils.py +7 -1
- diffusers/loaders/single_file.py +1 -1
- diffusers/loaders/single_file_model.py +5 -0
- diffusers/loaders/single_file_utils.py +242 -2
- diffusers/loaders/unet.py +307 -272
- diffusers/models/__init__.py +5 -3
- diffusers/models/attention.py +125 -1
- diffusers/models/attention_processor.py +169 -1
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
- diffusers/models/autoencoders/autoencoder_kl.py +17 -6
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
- diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
- diffusers/models/autoencoders/vq_model.py +182 -0
- diffusers/models/controlnet_xs.py +6 -6
- diffusers/models/embeddings.py +112 -84
- diffusers/models/model_loading_utils.py +55 -0
- diffusers/models/modeling_utils.py +128 -17
- diffusers/models/normalization.py +11 -6
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/dual_transformer_2d.py +5 -4
- diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
- diffusers/models/transformers/prior_transformer.py +5 -5
- diffusers/models/transformers/transformer_2d.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +344 -0
- diffusers/models/transformers/transformer_temporal.py +12 -10
- diffusers/models/unets/unet_1d.py +3 -3
- diffusers/models/unets/unet_2d.py +3 -3
- diffusers/models/unets/unet_2d_condition.py +4 -15
- diffusers/models/unets/unet_3d_condition.py +5 -17
- diffusers/models/unets/unet_i2vgen_xl.py +4 -4
- diffusers/models/unets/unet_motion_model.py +4 -4
- diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
- diffusers/models/vq_model.py +8 -165
- diffusers/pipelines/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
- diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
- diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
- diffusers/pipelines/pia/pipeline_pia.py +4 -3
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +886 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +923 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
- diffusers/schedulers/__init__.py +2 -0
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
- diffusers/schedulers/scheduling_edm_euler.py +2 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/training_utils.py +4 -4
- diffusers/utils/__init__.py +3 -0
- diffusers/utils/constants.py +2 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0
- diffusers/utils/dynamic_modules_utils.py +15 -13
- diffusers/utils/hub_utils.py +106 -0
- diffusers/utils/import_utils.py +0 -1
- diffusers/utils/logging.py +3 -1
- diffusers/utils/state_dict_utils.py +2 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/METADATA +45 -45
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/RECORD +108 -111
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/WHEEL +1 -1
- diffusers/models/dual_transformer_2d.py +0 -20
- diffusers/models/prior_transformer.py +0 -12
- diffusers/models/t5_film_transformer.py +0 -70
- diffusers/models/transformer_2d.py +0 -25
- diffusers/models/transformer_temporal.py +0 -34
- diffusers/models/unet_1d.py +0 -26
- diffusers/models/unet_1d_blocks.py +0 -203
- diffusers/models/unet_2d.py +0 -27
- diffusers/models/unet_2d_blocks.py +0 -375
- diffusers/models/unet_2d_condition.py +0 -25
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/LICENSE +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/top_level.txt +0 -0
diffusers/models/vq_model.py
CHANGED
@@ -11,172 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from
-from
+from ..utils import deprecate
+from .autoencoders.vq_model import VQEncoderOutput, VQModel
 
-import torch
-import torch.nn as nn
 
-
-from
-
-from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
-from .modeling_utils import ModelMixin
+class VQEncoderOutput(VQEncoderOutput):
+    deprecation_message = "Importing `VQEncoderOutput` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQEncoderOutput`, instead."
+    deprecate("VQEncoderOutput", "0.31", deprecation_message)
 
 
-
-
-    """
-    Output of VQModel encoding method.
-
-    Args:
-        latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
-            The encoded output sample from the last layer of the model.
-    """
-
-    latents: torch.Tensor
-
-
-class VQModel(ModelMixin, ConfigMixin):
-    r"""
-    A VQ-VAE model for decoding latent representations.
-
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
-    for all models (such as downloading or saving).
-
-    Parameters:
-        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
-        out_channels (int, *optional*, defaults to 3): Number of channels in the output.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
-            Tuple of downsample block types.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
-            Tuple of upsample block types.
-        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
-            Tuple of block output channels.
-        layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
-        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-        latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
-        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
-        num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
-        norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
-        vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
-        scaling_factor (`float`, *optional*, defaults to `0.18215`):
-            The component-wise standard deviation of the trained latent space computed using the first batch of the
-            training set. This is used to scale the latent space to have unit variance when training the diffusion
-            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
-            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
-            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
-            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
-        norm_type (`str`, *optional*, defaults to `"group"`):
-            Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
-    """
-
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 3,
-        out_channels: int = 3,
-        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int, ...] = (64,),
-        layers_per_block: int = 1,
-        act_fn: str = "silu",
-        latent_channels: int = 3,
-        sample_size: int = 32,
-        num_vq_embeddings: int = 256,
-        norm_num_groups: int = 32,
-        vq_embed_dim: Optional[int] = None,
-        scaling_factor: float = 0.18215,
-        norm_type: str = "group", # group, spatial
-        mid_block_add_attention=True,
-        lookup_from_codebook=False,
-        force_upcast=False,
-    ):
-        super().__init__()
-
-        # pass init params to Encoder
-        self.encoder = Encoder(
-            in_channels=in_channels,
-            out_channels=latent_channels,
-            down_block_types=down_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-            norm_num_groups=norm_num_groups,
-            double_z=False,
-            mid_block_add_attention=mid_block_add_attention,
-        )
-
-        vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
-
-        self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1)
-        self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
-        self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1)
-
-        # pass init params to Decoder
-        self.decoder = Decoder(
-            in_channels=latent_channels,
-            out_channels=out_channels,
-            up_block_types=up_block_types,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            act_fn=act_fn,
-            norm_num_groups=norm_num_groups,
-            norm_type=norm_type,
-            mid_block_add_attention=mid_block_add_attention,
-        )
-
-    @apply_forward_hook
-    def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
-        h = self.encoder(x)
-        h = self.quant_conv(h)
-
-        if not return_dict:
-            return (h,)
-
-        return VQEncoderOutput(latents=h)
-
-    @apply_forward_hook
-    def decode(
-        self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
-    ) -> Union[DecoderOutput, torch.Tensor]:
-        # also go through quantization layer
-        if not force_not_quantize:
-            quant, commit_loss, _ = self.quantize(h)
-        elif self.config.lookup_from_codebook:
-            quant = self.quantize.get_codebook_entry(h, shape)
-            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-        else:
-            quant = h
-            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
-        quant2 = self.post_quant_conv(quant)
-        dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)
-
-        if not return_dict:
-            return dec, commit_loss
-
-        return DecoderOutput(sample=dec, commit_loss=commit_loss)
-
-    def forward(
-        self, sample: torch.Tensor, return_dict: bool = True
-    ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
-        r"""
-        The [`VQModel`] forward method.
-
-        Args:
-            sample (`torch.Tensor`): Input sample.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.vq_model.VQEncoderOutput`] or `tuple`:
-                If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
-                is returned.
-        """
-
-        h = self.encode(sample).latents
-        dec = self.decode(h)
-
-        if not return_dict:
-            return dec.sample, dec.commit_loss
-        return dec
+class VQModel(VQModel):
+    deprecation_message = "Importing `VQModel` from `diffusers.models.vq_model` is deprecated and this will be removed in a future version. Please use `from diffusers.models.autoencoders.vq_model import VQModel`, instead."
+    deprecate("VQModel", "0.31", deprecation_message)
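The old module is now a thin deprecation shim over the relocated implementation. A minimal sketch of the two import paths (module paths taken from the deprecation messages above); the old path keeps working until the stated removal version but triggers a deprecation warning when the module is imported:

    # old location: still importable, warns via deprecate(..., "0.31", ...)
    from diffusers.models.vq_model import VQEncoderOutput, VQModel
    # new canonical location as of 0.29.0
    from diffusers.models.autoencoders.vq_model import VQEncoderOutput, VQModel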
diffusers/pipelines/__init__.py
CHANGED
@@ -220,6 +220,7 @@ else:
             "StableDiffusionLDM3DPipeline",
         ]
     )
+    _import_structure["stable_diffusion_3"] = ["StableDiffusion3Pipeline", "StableDiffusion3Img2ImgPipeline"]
     _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
     _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
     _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"]
@@ -485,6 +486,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             StableUnCLIPImg2ImgPipeline,
             StableUnCLIPPipeline,
         )
+        from .stable_diffusion_3 import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
         from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
         from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
         from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
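Both newly exported Stable Diffusion 3 pipelines follow the usual `DiffusionPipeline` calling convention. A minimal, hedged usage sketch (the checkpoint id is illustrative and not part of this diff; any SD3 checkpoint in Diffusers format should work):

    import torch
    from diffusers import StableDiffusion3Pipeline

    pipe = StableDiffusion3Pipeline.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
    ).to("cuda")
    image = pipe("a photo of an astronaut riding a horse on mars").images[0]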
diffusers/pipelines/animatediff/pipeline_animatediff.py
CHANGED
@@ -316,9 +316,10 @@ class AnimateDiffPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
CHANGED
@@ -420,9 +420,10 @@ class AnimateDiffVideoToVideoPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/controlnet/pipeline_controlnet.py
CHANGED
@@ -463,9 +463,10 @@ class StableDiffusionControlNetPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
CHANGED
@@ -441,9 +441,10 @@ class StableDiffusionControlNetImg2ImgPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
CHANGED
@@ -566,9 +566,10 @@ class StableDiffusionControlNetInpaintPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py
CHANGED
@@ -390,9 +390,10 @@ class StableDiffusionControlNetXSPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/deepfloyd_if/watermark.py
CHANGED
@@ -17,7 +17,7 @@ class IFWatermarker(ModelMixin, ConfigMixin):
         self.watermark_image_as_pil = None
 
     def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None):
-        #
+        # Copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287
 
         h = images[0].height
         w = images[0].width
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py
CHANGED
@@ -456,9 +456,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py
CHANGED
@@ -426,9 +426,10 @@ class StableDiffusionInpaintPipelineLegacy(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
CHANGED
@@ -364,9 +364,10 @@ class StableDiffusionModelEditingPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
CHANGED
@@ -355,9 +355,10 @@ class StableDiffusionParadigmsPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
CHANGED
@@ -578,9 +578,10 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
@@ -52,7 +52,9 @@ EXAMPLE_DOC_STRING = """
|
|
52
52
|
>>> import torch
|
53
53
|
>>> from diffusers import HunyuanDiTPipeline
|
54
54
|
|
55
|
-
>>> pipe = HunyuanDiTPipeline.from_pretrained(
|
55
|
+
>>> pipe = HunyuanDiTPipeline.from_pretrained(
|
56
|
+
... "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
|
57
|
+
... )
|
56
58
|
>>> pipe.to("cuda")
|
57
59
|
|
58
60
|
>>> # You may also use English prompt as HunyuanDiT supports both English and Chinese
|
@@ -226,16 +228,22 @@ class HunyuanDiTPipeline(DiffusionPipeline):
|
|
226
228
|
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
|
227
229
|
)
|
228
230
|
|
229
|
-
self.vae_scale_factor =
|
231
|
+
self.vae_scale_factor = (
|
232
|
+
2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
|
233
|
+
)
|
230
234
|
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
231
235
|
self.register_to_config(requires_safety_checker=requires_safety_checker)
|
232
|
-
self.default_sample_size =
|
236
|
+
self.default_sample_size = (
|
237
|
+
self.transformer.config.sample_size
|
238
|
+
if hasattr(self, "transformer") and self.transformer is not None
|
239
|
+
else 128
|
240
|
+
)
|
233
241
|
|
234
242
|
def encode_prompt(
|
235
243
|
self,
|
236
244
|
prompt: str,
|
237
|
-
device: torch.device,
|
238
|
-
dtype: torch.dtype,
|
245
|
+
device: torch.device = None,
|
246
|
+
dtype: torch.dtype = None,
|
239
247
|
num_images_per_prompt: int = 1,
|
240
248
|
do_classifier_free_guidance: bool = True,
|
241
249
|
negative_prompt: Optional[str] = None,
|
@@ -279,6 +287,17 @@ class HunyuanDiTPipeline(DiffusionPipeline):
|
|
279
287
|
text_encoder_index (`int`, *optional*):
|
280
288
|
Index of the text encoder to use. `0` for clip and `1` for T5.
|
281
289
|
"""
|
290
|
+
if dtype is None:
|
291
|
+
if self.text_encoder_2 is not None:
|
292
|
+
dtype = self.text_encoder_2.dtype
|
293
|
+
elif self.transformer is not None:
|
294
|
+
dtype = self.transformer.dtype
|
295
|
+
else:
|
296
|
+
dtype = None
|
297
|
+
|
298
|
+
if device is None:
|
299
|
+
device = self._execution_device
|
300
|
+
|
282
301
|
tokenizers = [self.tokenizer, self.tokenizer_2]
|
283
302
|
text_encoders = [self.text_encoder, self.text_encoder_2]
|
284
303
|
|
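With these defaults, `encode_prompt` no longer requires explicit `device`/`dtype` arguments; they fall back to the pipeline's execution device and to the dtype of `text_encoder_2` (or of the transformer). A hedged sketch of the relaxed call (checkpoint id as in the docstring example above; the returned embeddings and attention masks are left unpacked here):

    import torch
    from diffusers import HunyuanDiTPipeline

    pipe = HunyuanDiTPipeline.from_pretrained(
        "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
    ).to("cuda")
    # device and dtype may now be omitted; before 0.29.0 they were required arguments
    outputs = pipe.encode_prompt("An astronaut riding a horse", text_encoder_index=0)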
diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
CHANGED
@@ -405,9 +405,10 @@ class LatentConsistencyModelImg2ImgPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
CHANGED
@@ -389,9 +389,10 @@ class LatentConsistencyModelPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/marigold/marigold_image_processing.py
CHANGED
@@ -245,9 +245,9 @@ class MarigoldImageProcessor(ConfigMixin):
     ) -> Union[np.ndarray, torch.Tensor]:
         """
         Converts a monochrome image into an RGB image by applying the specified colormap. This function mimics the
-        behavior of matplotlib.colormaps, but allows the user to use the most discriminative color
-        without having to install or import matplotlib. For all other cases, the function will attempt to use
-        native implementation.
+        behavior of matplotlib.colormaps, but allows the user to use the most discriminative color maps ("Spectral",
+        "binary") without having to install or import matplotlib. For all other cases, the function will attempt to use
+        the native implementation.
 
         Args:
             image: 2D tensor of values between 0 and 1, either as np.ndarray or torch.Tensor.
@@ -255,7 +255,7 @@ class MarigoldImageProcessor(ConfigMixin):
             bytes: Whether to return the output as uint8 or floating point image.
             _force_method:
                 Can be used to specify whether to use the native implementation (`"matplotlib"`), the efficient custom
-                implementation of the
+                implementation of the select color maps (`"custom"`), or rely on autodetection (`None`, default).
 
         Returns:
             An RGB-colorized tensor corresponding to the input image.
@@ -265,6 +265,26 @@ class MarigoldImageProcessor(ConfigMixin):
         if _force_method not in (None, "matplotlib", "custom"):
             raise ValueError("_force_method must be either `None`, `'matplotlib'` or `'custom'`.")
 
+        supported_cmaps = {
+            "binary": [
+                (1.0, 1.0, 1.0),
+                (0.0, 0.0, 0.0),
+            ],
+            "Spectral": [  # Taken from matplotlib/_cm.py
+                (0.61960784313725492, 0.003921568627450980, 0.25882352941176473),  # 0.0 -> [0]
+                (0.83529411764705885, 0.24313725490196078, 0.30980392156862746),
+                (0.95686274509803926, 0.42745098039215684, 0.2627450980392157),
+                (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
+                (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
+                (1.0, 1.0, 0.74901960784313726),
+                (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
+                (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
+                (0.4, 0.76078431372549016, 0.6470588235294118),
+                (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
+                (0.36862745098039218, 0.30980392156862746, 0.63529411764705879),  # 1.0 -> [K-1]
+            ],
+        }
+
         def method_matplotlib(image, cmap, bytes=False):
             if is_matplotlib_available():
                 import matplotlib
@@ -298,24 +318,19 @@ class MarigoldImageProcessor(ConfigMixin):
             else:
                 image = image.float()
 
-
-
+            is_cmap_reversed = cmap.endswith("_r")
+            if is_cmap_reversed:
+                cmap = cmap[:-2]
 
-
-            (
-
-
-            (0.99215686274509807, 0.68235294117647061, 0.38039215686274508),
-            (0.99607843137254903, 0.8784313725490196, 0.54509803921568623),
-            (1.0, 1.0, 0.74901960784313726),
-            (0.90196078431372551, 0.96078431372549022, 0.59607843137254901),
-            (0.6705882352941176, 0.8666666666666667, 0.64313725490196083),
-            (0.4, 0.76078431372549016, 0.6470588235294118),
-            (0.19607843137254902, 0.53333333333333333, 0.74117647058823533),
-            (0.36862745098039218, 0.30980392156862746, 0.63529411764705879), # 1.0 -> [K-1]
-            )
+            if cmap not in supported_cmaps:
+                raise ValueError(
+                    f"Only {list(supported_cmaps.keys())} color maps are available without installing matplotlib."
+                )
 
-            cmap =
+            cmap = supported_cmaps[cmap]
+            if is_cmap_reversed:
+                cmap = cmap[::-1]
+            cmap = torch.tensor(cmap, dtype=torch.float, device=image.device)  # [K,3]
             K = cmap.shape[0]
 
             pos = image.clamp(min=0, max=1) * (K - 1)
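The built-in color maps now live in a single `supported_cmaps` table, and reversed names are handled by stripping the `"_r"` suffix and flipping the row order. A hedged usage sketch (assuming `MarigoldImageProcessor` constructs with its default config and that `colormap` is reachable on an instance, as its signature in the first hunk above suggests):

    import torch
    from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor

    processor = MarigoldImageProcessor()  # default config assumed
    depth = torch.rand(480, 640)          # 2D values in [0, 1], e.g. a normalized depth map
    # "Spectral" and "binary" work without matplotlib; the "_r" suffix now selects the reversed map
    colored = processor.colormap(depth, cmap="Spectral_r", bytes=True)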
diffusers/pipelines/pia/pipeline_pia.py
CHANGED
@@ -375,9 +375,10 @@ class PIAPipeline(
         negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
CHANGED
@@ -394,7 +394,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):
 
         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens = [negative_prompt] * batch_size
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
             uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
CHANGED
@@ -320,7 +320,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
 
         # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens = [negative_prompt] * batch_size
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
             uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
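The same one-line fix lands in both PixArt pipelines: a `negative_prompt` that is already a list is now passed through instead of being re-wrapped per prompt. A hedged usage sketch of the behavior this enables (checkpoint id illustrative, not taken from this diff):

    import torch
    from diffusers import PixArtSigmaPipeline

    pipe = PixArtSigmaPipeline.from_pretrained(
        "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
    ).to("cuda")
    images = pipe(
        prompt=["a red fox in the snow", "a blue jay on a branch"],
        negative_prompt=["blurry, low quality", "overexposed"],  # per-prompt list, previously re-wrapped incorrectly
    ).images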