diffusers 0.28.0__py3-none-any.whl → 0.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +9 -1
- diffusers/configuration_utils.py +17 -0
- diffusers/models/__init__.py +6 -0
- diffusers/models/activations.py +12 -0
- diffusers/models/attention_processor.py +108 -0
- diffusers/models/embeddings.py +216 -8
- diffusers/models/model_loading_utils.py +28 -0
- diffusers/models/modeling_outputs.py +14 -0
- diffusers/models/modeling_utils.py +57 -1
- diffusers/models/normalization.py +2 -1
- diffusers/models/transformers/__init__.py +3 -0
- diffusers/models/transformers/dit_transformer_2d.py +240 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +427 -0
- diffusers/models/transformers/pixart_transformer_2d.py +336 -0
- diffusers/models/transformers/transformer_2d.py +37 -45
- diffusers/pipelines/__init__.py +2 -0
- diffusers/pipelines/dit/pipeline_dit.py +4 -4
- diffusers/pipelines/hunyuandit/__init__.py +48 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +881 -0
- diffusers/pipelines/pipeline_loading_utils.py +1 -0
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +4 -4
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +2 -2
- diffusers/utils/dummy_pt_objects.py +45 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +15 -0
- {diffusers-0.28.0.dist-info → diffusers-0.28.1.dist-info}/METADATA +44 -44
- {diffusers-0.28.0.dist-info → diffusers-0.28.1.dist-info}/RECORD +30 -25
- {diffusers-0.28.0.dist-info → diffusers-0.28.1.dist-info}/WHEEL +1 -1
- {diffusers-0.28.0.dist-info → diffusers-0.28.1.dist-info}/LICENSE +0 -0
- {diffusers-0.28.0.dist-info → diffusers-0.28.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.28.0.dist-info → diffusers-0.28.1.dist-info}/top_level.txt +0 -0
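A quick way to verify the upgrade took effect is to import the objects this patch release adds. A minimal sketch, assuming the new classes are re-exported from the package root (which the `diffusers/__init__.py` change of +9 -1 suggests, following the library's usual pattern for new models and pipelines):

```python
# pip install -U "diffusers==0.28.1"
import diffusers

print(diffusers.__version__)  # expected: "0.28.1"

# New public objects introduced or re-wired by this patch release (assumed top-level re-exports):
from diffusers import DiTTransformer2DModel, HunyuanDiTPipeline, PixArtTransformer2DModel
```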
diffusers/models/transformers/pixart_transformer_2d.py
ADDED
@@ -0,0 +1,336 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+
+import torch
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import is_torch_version, logging
+from ..attention import BasicTransformerBlock
+from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormSingle
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
+    r"""
+    A 2D Transformer model as introduced in PixArt family of models (https://arxiv.org/abs/2310.00426,
+    https://arxiv.org/abs/2403.04692).
+
+    Parameters:
+        num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
+        in_channels (int, defaults to 4): The number of channels in the input.
+        out_channels (int, optional):
+            The number of channels in the output. Specify this parameter if the output channel number differs from the
+            input.
+        num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
+        dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
+        norm_num_groups (int, optional, defaults to 32):
+            Number of groups for group normalization within Transformer blocks.
+        cross_attention_dim (int, optional):
+            The dimensionality for cross-attention layers, typically matching the encoder's hidden dimension.
+        attention_bias (bool, optional, defaults to True):
+            Configure if the Transformer blocks' attention should contain a bias parameter.
+        sample_size (int, defaults to 128):
+            The width of the latent images. This parameter is fixed during training.
+        patch_size (int, defaults to 2):
+            Size of the patches the model processes, relevant for architectures working on non-sequential data.
+        activation_fn (str, optional, defaults to "gelu-approximate"):
+            Activation function to use in feed-forward networks within Transformer blocks.
+        num_embeds_ada_norm (int, optional, defaults to 1000):
+            Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
+            inference.
+        upcast_attention (bool, optional, defaults to False):
+            If true, upcasts the attention mechanism dimensions for potentially improved performance.
+        norm_type (str, optional, defaults to "ada_norm_zero"):
+            Specifies the type of normalization used, can be 'ada_norm_zero'.
+        norm_elementwise_affine (bool, optional, defaults to False):
+            If true, enables element-wise affine parameters in the normalization layers.
+        norm_eps (float, optional, defaults to 1e-6):
+            A small constant added to the denominator in normalization layers to prevent division by zero.
+        interpolation_scale (int, optional): Scale factor to use during interpolating the position embeddings.
+        use_additional_conditions (bool, optional): If we're using additional conditions as inputs.
+        attention_type (str, optional, defaults to "default"): Kind of attention mechanism to be used.
+        caption_channels (int, optional, defaults to None):
+            Number of channels to use for projecting the caption embeddings.
+        use_linear_projection (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+        num_vector_embeds (bool, optional, defaults to False):
+            Deprecated argument. Will be removed in a future version.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["BasicTransformerBlock", "PatchEmbed"]
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 72,
+        in_channels: int = 4,
+        out_channels: Optional[int] = 8,
+        num_layers: int = 28,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = 1152,
+        attention_bias: bool = True,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        activation_fn: str = "gelu-approximate",
+        num_embeds_ada_norm: Optional[int] = 1000,
+        upcast_attention: bool = False,
+        norm_type: str = "ada_norm_single",
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-6,
+        interpolation_scale: Optional[int] = None,
+        use_additional_conditions: Optional[bool] = None,
+        caption_channels: Optional[int] = None,
+        attention_type: Optional[str] = "default",
+    ):
+        super().__init__()
+
+        # Validate inputs.
+        if norm_type != "ada_norm_single":
+            raise NotImplementedError(
+                f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+            )
+        elif norm_type == "ada_norm_single" and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+            )
+
+        # Set some common variables used across the board.
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if use_additional_conditions is None:
+            if sample_size == 128:
+                use_additional_conditions = True
+            else:
+                use_additional_conditions = False
+        self.use_additional_conditions = use_additional_conditions
+
+        self.gradient_checkpointing = False
+
+        # 2. Initialize the position embedding and transformer blocks.
+        self.height = self.config.sample_size
+        self.width = self.config.sample_size
+
+        interpolation_scale = (
+            self.config.interpolation_scale
+            if self.config.interpolation_scale is not None
+            else max(self.config.sample_size // 64, 1)
+        )
+        self.pos_embed = PatchEmbed(
+            height=self.config.sample_size,
+            width=self.config.sample_size,
+            patch_size=self.config.patch_size,
+            in_channels=self.config.in_channels,
+            embed_dim=self.inner_dim,
+            interpolation_scale=interpolation_scale,
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    self.config.num_attention_heads,
+                    self.config.attention_head_dim,
+                    dropout=self.config.dropout,
+                    cross_attention_dim=self.config.cross_attention_dim,
+                    activation_fn=self.config.activation_fn,
+                    num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                    attention_bias=self.config.attention_bias,
+                    upcast_attention=self.config.upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=self.config.norm_elementwise_affine,
+                    norm_eps=self.config.norm_eps,
+                    attention_type=self.config.attention_type,
+                )
+                for _ in range(self.config.num_layers)
+            ]
+        )
+
+        # 3. Output blocks.
+        self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+        self.proj_out = nn.Linear(self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels)
+
+        self.adaln_single = AdaLayerNormSingle(
+            self.inner_dim, use_additional_conditions=self.use_additional_conditions
+        )
+        self.caption_projection = None
+        if self.config.caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(
+                in_features=self.config.caption_channels, hidden_size=self.inner_dim
+            )
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`PixArtTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep (`torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            added_cond_kwargs: (`Dict[str, Any]`, *optional*): Additional conditions to be used as inputs.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if self.use_additional_conditions and added_cond_kwargs is None:
+            raise ValueError("`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.")
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        batch_size = hidden_states.shape[0]
+        height, width = (
+            hidden_states.shape[-2] // self.config.patch_size,
+            hidden_states.shape[-1] // self.config.patch_size,
+        )
+        hidden_states = self.pos_embed(hidden_states)
+
+        timestep, embedded_timestep = self.adaln_single(
+            timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+        )
+
+        if self.caption_projection is not None:
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    None,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=None,
+                )
+
+        # 3. Output
+        shift, scale = (
+            self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
+        ).chunk(2, dim=1)
+        hidden_states = self.norm_out(hidden_states)
+        # Modulation
+        hidden_states = hidden_states * (1 + scale.to(hidden_states.device)) + shift.to(hidden_states.device)
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = hidden_states.squeeze(1)
+
+        # unpatchify
+        hidden_states = hidden_states.reshape(
+            shape=(-1, height, width, self.config.patch_size, self.config.patch_size, self.out_channels)
+        )
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(
+            shape=(-1, self.out_channels, height * self.config.patch_size, width * self.config.patch_size)
+        )
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
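The new `PixArtTransformer2DModel` is a standalone module, so it can be exercised in isolation. Below is a minimal smoke-test sketch with a deliberately tiny config; the sizes, and the habit of passing `None` for the resolution/aspect-ratio conditions when `sample_size != 128`, are illustrative assumptions, while real PixArt checkpoints use the defaults documented above.

```python
import torch
from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel

# Tiny config so the example runs quickly; real checkpoints use the defaults shown in the diff.
model = PixArtTransformer2DModel(
    num_attention_heads=2,
    attention_head_dim=8,
    in_channels=4,
    out_channels=8,
    num_layers=1,
    cross_attention_dim=16,
    sample_size=8,          # != 128, so no additional conditions are required
    caption_channels=None,  # skip the caption projection for simplicity
)

latents = torch.randn(1, 4, 8, 8)
text_embeds = torch.randn(1, 7, 16)  # (batch, seq_len, cross_attention_dim)
timestep = torch.tensor([999], dtype=torch.long)

with torch.no_grad():
    out = model(
        hidden_states=latents,
        encoder_hidden_states=text_embeds,
        timestep=timestep,
        # adaln_single unpacks this dict, so pass explicit None values rather than omitting it
        added_cond_kwargs={"resolution": None, "aspect_ratio": None},
    ).sample

print(out.shape)  # torch.Size([1, 8, 8, 8]) -> (batch, out_channels, height, width)
```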
diffusers/models/transformers/transformer_2d.py
CHANGED
@@ -11,39 +11,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from dataclasses import dataclass
 from typing import Any, Dict, Optional
 
 import torch
 import torch.nn.functional as F
 from torch import nn
 
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...utils import BaseOutput, deprecate, is_torch_version, logging
+from ...configuration_utils import LegacyConfigMixin, register_to_config
+from ...utils import deprecate, is_torch_version, logging
 from ..attention import BasicTransformerBlock
 from ..embeddings import ImagePositionalEmbeddings, PatchEmbed, PixArtAlphaTextProjection
-from ..modeling_utils import ModelMixin
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import LegacyModelMixin
 from ..normalization import AdaLayerNormSingle
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-@dataclass
-class Transformer2DModelOutput(BaseOutput):
-    """
-    The output of [`Transformer2DModel`].
-
-    Args:
-        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
-            The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
-            distributions for the unnoised latent pixels.
-    """
-
-    sample: torch.Tensor
+class Transformer2DModelOutput(Transformer2DModelOutput):
+    deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead."
+    deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
 
 
-class Transformer2DModel(ModelMixin, ConfigMixin):
+class Transformer2DModel(LegacyModelMixin, LegacyConfigMixin):
     """
     A 2D Transformer model for image-like data.
 
@@ -116,40 +107,12 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                 f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
             )
 
-        # Set some common variables used across the board.
-        self.use_linear_projection = use_linear_projection
-        self.interpolation_scale = interpolation_scale
-        self.caption_channels = caption_channels
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
-        self.in_channels = in_channels
-        self.out_channels = in_channels if out_channels is None else out_channels
-        self.gradient_checkpointing = False
-        if use_additional_conditions is None:
-            if norm_type == "ada_norm_single" and sample_size == 128:
-                use_additional_conditions = True
-            else:
-                use_additional_conditions = False
-        self.use_additional_conditions = use_additional_conditions
-
         # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
         # Define whether input is continuous or discrete depending on configuration
         self.is_input_continuous = (in_channels is not None) and (patch_size is None)
         self.is_input_vectorized = num_vector_embeds is not None
         self.is_input_patches = in_channels is not None and patch_size is not None
 
-        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
-            deprecation_message = (
-                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
-                " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
-                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
-                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
-                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
-            )
-            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
-            norm_type = "ada_norm"
-
         if self.is_input_continuous and self.is_input_vectorized:
             raise ValueError(
                 f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
@@ -166,6 +129,35 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                 f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
             )
 
+        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+            deprecation_message = (
+                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+                " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+            )
+            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+            norm_type = "ada_norm"
+
+        # Set some common variables used across the board.
+        self.use_linear_projection = use_linear_projection
+        self.interpolation_scale = interpolation_scale
+        self.caption_channels = caption_channels
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.gradient_checkpointing = False
+
+        if use_additional_conditions is None:
+            if norm_type == "ada_norm_single" and sample_size == 128:
+                use_additional_conditions = True
+            else:
+                use_additional_conditions = False
+        self.use_additional_conditions = use_additional_conditions
+
         # 2. Initialize the right blocks.
         # These functions follow a common structure:
         # a. Initialize the input blocks. b. Initialize the transformer blocks.
diffusers/pipelines/__init__.py
CHANGED
@@ -150,6 +150,7 @@ else:
         "IFPipeline",
         "IFSuperResolutionPipeline",
     ]
+    _import_structure["hunyuandit"] = ["HunyuanDiTPipeline"]
     _import_structure["kandinsky"] = [
         "KandinskyCombinedPipeline",
         "KandinskyImg2ImgCombinedPipeline",
@@ -418,6 +419,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             VersatileDiffusionTextToImagePipeline,
             VQDiffusionPipeline,
         )
+        from .hunyuandit import HunyuanDiTPipeline
        from .i2vgen_xl import I2VGenXLPipeline
        from .kandinsky import (
            KandinskyCombinedPipeline,
diffusers/pipelines/dit/pipeline_dit.py
CHANGED
@@ -22,7 +22,7 @@ from typing import Dict, List, Optional, Tuple, Union
 
 import torch
 
-from ...models import AutoencoderKL, Transformer2DModel
+from ...models import AutoencoderKL, DiTTransformer2DModel
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -36,8 +36,8 @@ class DiTPipeline(DiffusionPipeline):
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
 
     Parameters:
-        transformer ([`Transformer2DModel`]):
-            A class conditioned `Transformer2DModel` to denoise the encoded image latents.
+        transformer ([`DiTTransformer2DModel`]):
+            A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents.
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
         scheduler ([`DDIMScheduler`]):
@@ -48,7 +48,7 @@ class DiTPipeline(DiffusionPipeline):
 
     def __init__(
         self,
-        transformer: Transformer2DModel,
+        transformer: DiTTransformer2DModel,
         vae: AutoencoderKL,
         scheduler: KarrasDiffusionSchedulers,
         id2label: Optional[Dict[int, str]] = None,
diffusers/pipelines/hunyuandit/__init__.py
ADDED
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_hunyuandit"] = ["HunyuanDiTPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_hunyuandit import HunyuanDiTPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)