diffusers 0.28.0__py3-none-any.whl → 0.28.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (31)
  1. diffusers/__init__.py +9 -1
  2. diffusers/configuration_utils.py +17 -0
  3. diffusers/loaders/single_file_utils.py +1 -1
  4. diffusers/models/__init__.py +6 -0
  5. diffusers/models/activations.py +12 -0
  6. diffusers/models/attention_processor.py +108 -0
  7. diffusers/models/embeddings.py +216 -8
  8. diffusers/models/model_loading_utils.py +28 -0
  9. diffusers/models/modeling_outputs.py +14 -0
  10. diffusers/models/modeling_utils.py +57 -1
  11. diffusers/models/normalization.py +2 -1
  12. diffusers/models/transformers/__init__.py +3 -0
  13. diffusers/models/transformers/dit_transformer_2d.py +240 -0
  14. diffusers/models/transformers/hunyuan_transformer_2d.py +427 -0
  15. diffusers/models/transformers/pixart_transformer_2d.py +336 -0
  16. diffusers/models/transformers/transformer_2d.py +37 -45
  17. diffusers/pipelines/__init__.py +2 -0
  18. diffusers/pipelines/dit/pipeline_dit.py +4 -4
  19. diffusers/pipelines/hunyuandit/__init__.py +48 -0
  20. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +881 -0
  21. diffusers/pipelines/pipeline_loading_utils.py +1 -0
  22. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +4 -4
  23. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +2 -2
  24. diffusers/utils/dummy_pt_objects.py +45 -0
  25. diffusers/utils/dummy_torch_and_transformers_objects.py +15 -0
  26. {diffusers-0.28.0.dist-info → diffusers-0.28.2.dist-info}/METADATA +44 -44
  27. {diffusers-0.28.0.dist-info → diffusers-0.28.2.dist-info}/RECORD +31 -26
  28. {diffusers-0.28.0.dist-info → diffusers-0.28.2.dist-info}/WHEEL +1 -1
  29. {diffusers-0.28.0.dist-info → diffusers-0.28.2.dist-info}/LICENSE +0 -0
  30. {diffusers-0.28.0.dist-info → diffusers-0.28.2.dist-info}/entry_points.txt +0 -0
  31. {diffusers-0.28.0.dist-info → diffusers-0.28.2.dist-info}/top_level.txt +0 -0
diffusers/models/transformers/pixart_transformer_2d.py (new file)
@@ -0,0 +1,336 @@
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Any, Dict, Optional
+
+ import torch
+ from torch import nn
+
+ from ...configuration_utils import ConfigMixin, register_to_config
+ from ...utils import is_torch_version, logging
+ from ..attention import BasicTransformerBlock
+ from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
+ from ..modeling_outputs import Transformer2DModelOutput
+ from ..modeling_utils import ModelMixin
+ from ..normalization import AdaLayerNormSingle
+
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+ class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
+     r"""
+     A 2D Transformer model as introduced in PixArt family of models (https://arxiv.org/abs/2310.00426,
+     https://arxiv.org/abs/2403.04692).
+
+     Parameters:
+         num_attention_heads (int, optional, defaults to 16): The number of heads to use for multi-head attention.
+         attention_head_dim (int, optional, defaults to 72): The number of channels in each head.
+         in_channels (int, defaults to 4): The number of channels in the input.
+         out_channels (int, optional):
+             The number of channels in the output. Specify this parameter if the output channel number differs from the
+             input.
+         num_layers (int, optional, defaults to 28): The number of layers of Transformer blocks to use.
+         dropout (float, optional, defaults to 0.0): The dropout probability to use within the Transformer blocks.
+         norm_num_groups (int, optional, defaults to 32):
+             Number of groups for group normalization within Transformer blocks.
+         cross_attention_dim (int, optional):
+             The dimensionality for cross-attention layers, typically matching the encoder's hidden dimension.
+         attention_bias (bool, optional, defaults to True):
+             Configure if the Transformer blocks' attention should contain a bias parameter.
+         sample_size (int, defaults to 128):
+             The width of the latent images. This parameter is fixed during training.
+         patch_size (int, defaults to 2):
+             Size of the patches the model processes, relevant for architectures working on non-sequential data.
+         activation_fn (str, optional, defaults to "gelu-approximate"):
+             Activation function to use in feed-forward networks within Transformer blocks.
+         num_embeds_ada_norm (int, optional, defaults to 1000):
+             Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
+             inference.
+         upcast_attention (bool, optional, defaults to False):
+             If true, upcasts the attention mechanism dimensions for potentially improved performance.
+         norm_type (str, optional, defaults to "ada_norm_zero"):
+             Specifies the type of normalization used, can be 'ada_norm_zero'.
+         norm_elementwise_affine (bool, optional, defaults to False):
+             If true, enables element-wise affine parameters in the normalization layers.
+         norm_eps (float, optional, defaults to 1e-6):
+             A small constant added to the denominator in normalization layers to prevent division by zero.
+         interpolation_scale (int, optional): Scale factor to use during interpolating the position embeddings.
+         use_additional_conditions (bool, optional): If we're using additional conditions as inputs.
+         attention_type (str, optional, defaults to "default"): Kind of attention mechanism to be used.
+         caption_channels (int, optional, defaults to None):
+             Number of channels to use for projecting the caption embeddings.
+         use_linear_projection (bool, optional, defaults to False):
+             Deprecated argument. Will be removed in a future version.
+         num_vector_embeds (bool, optional, defaults to False):
+             Deprecated argument. Will be removed in a future version.
+     """
+
+     _supports_gradient_checkpointing = True
+     _no_split_modules = ["BasicTransformerBlock", "PatchEmbed"]
+
+     @register_to_config
+     def __init__(
+         self,
+         num_attention_heads: int = 16,
+         attention_head_dim: int = 72,
+         in_channels: int = 4,
+         out_channels: Optional[int] = 8,
+         num_layers: int = 28,
+         dropout: float = 0.0,
+         norm_num_groups: int = 32,
+         cross_attention_dim: Optional[int] = 1152,
+         attention_bias: bool = True,
+         sample_size: int = 128,
+         patch_size: int = 2,
+         activation_fn: str = "gelu-approximate",
+         num_embeds_ada_norm: Optional[int] = 1000,
+         upcast_attention: bool = False,
+         norm_type: str = "ada_norm_single",
+         norm_elementwise_affine: bool = False,
+         norm_eps: float = 1e-6,
+         interpolation_scale: Optional[int] = None,
+         use_additional_conditions: Optional[bool] = None,
+         caption_channels: Optional[int] = None,
+         attention_type: Optional[str] = "default",
+     ):
+         super().__init__()
+
+         # Validate inputs.
+         if norm_type != "ada_norm_single":
+             raise NotImplementedError(
+                 f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
+             )
+         elif norm_type == "ada_norm_single" and num_embeds_ada_norm is None:
+             raise ValueError(
+                 f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
+             )
+
+         # Set some common variables used across the board.
+         self.attention_head_dim = attention_head_dim
+         self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+         self.out_channels = in_channels if out_channels is None else out_channels
+         if use_additional_conditions is None:
+             if sample_size == 128:
+                 use_additional_conditions = True
+             else:
+                 use_additional_conditions = False
+         self.use_additional_conditions = use_additional_conditions
+
+         self.gradient_checkpointing = False
+
+         # 2. Initialize the position embedding and transformer blocks.
+         self.height = self.config.sample_size
+         self.width = self.config.sample_size
+
+         interpolation_scale = (
+             self.config.interpolation_scale
+             if self.config.interpolation_scale is not None
+             else max(self.config.sample_size // 64, 1)
+         )
+         self.pos_embed = PatchEmbed(
+             height=self.config.sample_size,
+             width=self.config.sample_size,
+             patch_size=self.config.patch_size,
+             in_channels=self.config.in_channels,
+             embed_dim=self.inner_dim,
+             interpolation_scale=interpolation_scale,
+         )
+
+         self.transformer_blocks = nn.ModuleList(
+             [
+                 BasicTransformerBlock(
+                     self.inner_dim,
+                     self.config.num_attention_heads,
+                     self.config.attention_head_dim,
+                     dropout=self.config.dropout,
+                     cross_attention_dim=self.config.cross_attention_dim,
+                     activation_fn=self.config.activation_fn,
+                     num_embeds_ada_norm=self.config.num_embeds_ada_norm,
+                     attention_bias=self.config.attention_bias,
+                     upcast_attention=self.config.upcast_attention,
+                     norm_type=norm_type,
+                     norm_elementwise_affine=self.config.norm_elementwise_affine,
+                     norm_eps=self.config.norm_eps,
+                     attention_type=self.config.attention_type,
+                 )
+                 for _ in range(self.config.num_layers)
+             ]
+         )
+
+         # 3. Output blocks.
+         self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+         self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
+         self.proj_out = nn.Linear(self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels)
+
+         self.adaln_single = AdaLayerNormSingle(
+             self.inner_dim, use_additional_conditions=self.use_additional_conditions
+         )
+         self.caption_projection = None
+         if self.config.caption_channels is not None:
+             self.caption_projection = PixArtAlphaTextProjection(
+                 in_features=self.config.caption_channels, hidden_size=self.inner_dim
+             )
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if hasattr(module, "gradient_checkpointing"):
+             module.gradient_checkpointing = value
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         timestep: Optional[torch.LongTensor] = None,
+         added_cond_kwargs: Dict[str, torch.Tensor] = None,
+         cross_attention_kwargs: Dict[str, Any] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         encoder_attention_mask: Optional[torch.Tensor] = None,
+         return_dict: bool = True,
+     ):
+         """
+         The [`PixArtTransformer2DModel`] forward method.
+
+         Args:
+             hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+                 Input `hidden_states`.
+             encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                 Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                 self-attention.
+             timestep (`torch.LongTensor`, *optional*):
+                 Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+             added_cond_kwargs: (`Dict[str, Any]`, *optional*): Additional conditions to be used as inputs.
+             cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                 `self.processor` in
+                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+             attention_mask ( `torch.Tensor`, *optional*):
+                 An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                 is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                 negative values to the attention scores corresponding to "discard" tokens.
+             encoder_attention_mask ( `torch.Tensor`, *optional*):
+                 Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                 * Mask `(batch, sequence_length)` True = keep, False = discard.
+                 * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                 If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                 above. This bias will be added to the cross-attention scores.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                 tuple.
+
+         Returns:
+             If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+             `tuple` where the first element is the sample tensor.
+         """
+         if self.use_additional_conditions and added_cond_kwargs is None:
+             raise ValueError("`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.")
+
+         # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+         #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+         #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+         # expects mask of shape:
+         #   [batch, key_tokens]
+         # adds singleton query_tokens dimension:
+         #   [batch, 1, key_tokens]
+         # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+         #   [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+         #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+         if attention_mask is not None and attention_mask.ndim == 2:
+             # assume that mask is expressed as:
+             #   (1 = keep, 0 = discard)
+             # convert mask into a bias that can be added to attention scores:
+             #   (keep = +0, discard = -10000.0)
+             attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+             attention_mask = attention_mask.unsqueeze(1)
+
+         # convert encoder_attention_mask to a bias the same way we do for attention_mask
+         if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+             encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+             encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+         # 1. Input
+         batch_size = hidden_states.shape[0]
+         height, width = (
+             hidden_states.shape[-2] // self.config.patch_size,
+             hidden_states.shape[-1] // self.config.patch_size,
+         )
+         hidden_states = self.pos_embed(hidden_states)
+
+         timestep, embedded_timestep = self.adaln_single(
+             timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+         )
+
+         if self.caption_projection is not None:
+             encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+             encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+         # 2. Blocks
+         for block in self.transformer_blocks:
+             if self.training and self.gradient_checkpointing:
+
+                 def create_custom_forward(module, return_dict=None):
+                     def custom_forward(*inputs):
+                         if return_dict is not None:
+                             return module(*inputs, return_dict=return_dict)
+                         else:
+                             return module(*inputs)
+
+                     return custom_forward
+
+                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(block),
+                     hidden_states,
+                     attention_mask,
+                     encoder_hidden_states,
+                     encoder_attention_mask,
+                     timestep,
+                     cross_attention_kwargs,
+                     None,
+                     **ckpt_kwargs,
+                 )
+             else:
+                 hidden_states = block(
+                     hidden_states,
+                     attention_mask=attention_mask,
+                     encoder_hidden_states=encoder_hidden_states,
+                     encoder_attention_mask=encoder_attention_mask,
+                     timestep=timestep,
+                     cross_attention_kwargs=cross_attention_kwargs,
+                     class_labels=None,
+                 )
+
+         # 3. Output
+         shift, scale = (
+             self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
+         ).chunk(2, dim=1)
+         hidden_states = self.norm_out(hidden_states)
+         # Modulation
+         hidden_states = hidden_states * (1 + scale.to(hidden_states.device)) + shift.to(hidden_states.device)
+         hidden_states = self.proj_out(hidden_states)
+         hidden_states = hidden_states.squeeze(1)
+
+         # unpatchify
+         hidden_states = hidden_states.reshape(
+             shape=(-1, height, width, self.config.patch_size, self.config.patch_size, self.out_channels)
+         )
+         hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+         output = hidden_states.reshape(
+             shape=(-1, self.out_channels, height * self.config.patch_size, width * self.config.patch_size)
+         )
+
+         if not return_dict:
+             return (output,)
+
+         return Transformer2DModelOutput(sample=output)
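
The new module moves PixArt out of the monolithic `Transformer2DModel` into a dedicated class that only supports `norm_type="ada_norm_single"`. It can be exercised end to end with a deliberately tiny, illustrative config (real PixArt checkpoints use the defaults above: 28 layers, 16 heads of 72 channels). A minimal smoke-test sketch, assuming the 0.28.2 API shown in this hunk; passing `resolution`/`aspect_ratio` as `None` mirrors what the PixArt pipelines do when `use_additional_conditions` is False:

import torch
from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel

model = PixArtTransformer2DModel(
    num_attention_heads=2,
    attention_head_dim=8,   # inner_dim = 2 * 8 = 16
    in_channels=4,
    out_channels=8,
    num_layers=1,
    cross_attention_dim=16,
    caption_channels=16,
    sample_size=8,          # != 128, so use_additional_conditions resolves to False
    patch_size=2,
)
latents = torch.randn(1, 4, 8, 8)    # (batch, in_channels, H, W)
text_emb = torch.randn(1, 12, 16)    # (batch, seq_len, caption_channels)
timestep = torch.tensor([999], dtype=torch.long)
out = model(
    hidden_states=latents,
    encoder_hidden_states=text_emb,
    timestep=timestep,
    added_cond_kwargs={"resolution": None, "aspect_ratio": None},
).sample
print(out.shape)  # torch.Size([1, 8, 8, 8]): unpatchified back to (batch, out_channels, H, W)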
diffusers/models/transformers/transformer_2d.py
@@ -11,39 +11,30 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from dataclasses import dataclass
  from typing import Any, Dict, Optional

  import torch
  import torch.nn.functional as F
  from torch import nn

- from ...configuration_utils import ConfigMixin, register_to_config
- from ...utils import BaseOutput, deprecate, is_torch_version, logging
+ from ...configuration_utils import LegacyConfigMixin, register_to_config
+ from ...utils import deprecate, is_torch_version, logging
  from ..attention import BasicTransformerBlock
  from ..embeddings import ImagePositionalEmbeddings, PatchEmbed, PixArtAlphaTextProjection
- from ..modeling_utils import ModelMixin
+ from ..modeling_outputs import Transformer2DModelOutput
+ from ..modeling_utils import LegacyModelMixin
  from ..normalization import AdaLayerNormSingle


  logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


- @dataclass
- class Transformer2DModelOutput(BaseOutput):
-     """
-     The output of [`Transformer2DModel`].
-
-     Args:
-         sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
-             The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
-             distributions for the unnoised latent pixels.
-     """
-
-     sample: torch.Tensor
+ class Transformer2DModelOutput(Transformer2DModelOutput):
+     deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead."
+     deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


- class Transformer2DModel(ModelMixin, ConfigMixin):
+ class Transformer2DModel(LegacyModelMixin, LegacyConfigMixin):
      """
      A 2D Transformer model for image-like data.

@@ -116,40 +107,12 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                  f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
              )

-         # Set some common variables used across the board.
-         self.use_linear_projection = use_linear_projection
-         self.interpolation_scale = interpolation_scale
-         self.caption_channels = caption_channels
-         self.num_attention_heads = num_attention_heads
-         self.attention_head_dim = attention_head_dim
-         self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
-         self.in_channels = in_channels
-         self.out_channels = in_channels if out_channels is None else out_channels
-         self.gradient_checkpointing = False
-         if use_additional_conditions is None:
-             if norm_type == "ada_norm_single" and sample_size == 128:
-                 use_additional_conditions = True
-             else:
-                 use_additional_conditions = False
-         self.use_additional_conditions = use_additional_conditions
-
          # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
          # Define whether input is continuous or discrete depending on configuration
          self.is_input_continuous = (in_channels is not None) and (patch_size is None)
          self.is_input_vectorized = num_vector_embeds is not None
          self.is_input_patches = in_channels is not None and patch_size is not None

-         if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
-             deprecation_message = (
-                 f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
-                 " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
-                 " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
-                 " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
-                 " would be very nice if you could open a Pull request for the `transformer/config.json` file"
-             )
-             deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
-             norm_type = "ada_norm"
-
          if self.is_input_continuous and self.is_input_vectorized:
              raise ValueError(
                  f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
@@ -166,6 +129,35 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                  f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
              )

+         if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+             deprecation_message = (
+                 f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+                 " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+                 " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+                 " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+                 " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+             )
+             deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+             norm_type = "ada_norm"
+
+         # Set some common variables used across the board.
+         self.use_linear_projection = use_linear_projection
+         self.interpolation_scale = interpolation_scale
+         self.caption_channels = caption_channels
+         self.num_attention_heads = num_attention_heads
+         self.attention_head_dim = attention_head_dim
+         self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
+         self.in_channels = in_channels
+         self.out_channels = in_channels if out_channels is None else out_channels
+         self.gradient_checkpointing = False
+
+         if use_additional_conditions is None:
+             if norm_type == "ada_norm_single" and sample_size == 128:
+                 use_additional_conditions = True
+             else:
+                 use_additional_conditions = False
+         self.use_additional_conditions = use_additional_conditions
+
          # 2. Initialize the right blocks.
          # These functions follow a common structure:
          # a. Initialize the input blocks. b. Initialize the transformer blocks.
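
The practical upshot of the shim above: `Transformer2DModelOutput` now lives in `diffusers.models.modeling_outputs`, and the old location only re-exports it behind a deprecation warning. A minimal sketch of the migration, grounded in the deprecation message in this hunk:

# Canonical import path from this release onward:
from diffusers.models.modeling_outputs import Transformer2DModelOutput

# The old path still resolves but emits a deprecation warning and is slated
# for removal in diffusers 1.0.0:
# from diffusers.models.transformer_2d import Transformer2DModelOutput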
diffusers/pipelines/__init__.py
@@ -150,6 +150,7 @@ else:
          "IFPipeline",
          "IFSuperResolutionPipeline",
      ]
+     _import_structure["hunyuandit"] = ["HunyuanDiTPipeline"]
      _import_structure["kandinsky"] = [
          "KandinskyCombinedPipeline",
          "KandinskyImg2ImgCombinedPipeline",
@@ -418,6 +419,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
          VersatileDiffusionTextToImagePipeline,
          VQDiffusionPipeline,
      )
+     from .hunyuandit import HunyuanDiTPipeline
      from .i2vgen_xl import I2VGenXLPipeline
      from .kandinsky import (
          KandinskyCombinedPipeline,
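
These two lines register the new pipeline with the package's lazy-import machinery, making `HunyuanDiTPipeline` importable from `diffusers` directly. A minimal usage sketch, assuming a HunyuanDiT checkpoint in the diffusers format on the Hub (the repository id below is illustrative; substitute the actual checkpoint):

import torch
from diffusers import HunyuanDiTPipeline

pipe = HunyuanDiTPipeline.from_pretrained(
    "Tencent-Hunyuan/HunyuanDiT-Diffusers",  # illustrative repo id
    torch_dtype=torch.float16,
).to("cuda")
image = pipe(prompt="An astronaut riding a horse").images[0]
image.save("astronaut.png")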
diffusers/pipelines/dit/pipeline_dit.py
@@ -22,7 +22,7 @@ from typing import Dict, List, Optional, Tuple, Union

  import torch

- from ...models import AutoencoderKL, Transformer2DModel
+ from ...models import AutoencoderKL, DiTTransformer2DModel
  from ...schedulers import KarrasDiffusionSchedulers
  from ...utils.torch_utils import randn_tensor
  from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -36,8 +36,8 @@ class DiTPipeline(DiffusionPipeline):
      implemented for all pipelines (downloading, saving, running on a particular device, etc.).

      Parameters:
-         transformer ([`Transformer2DModel`]):
-             A class conditioned `Transformer2DModel` to denoise the encoded image latents.
+         transformer ([`DiTTransformer2DModel`]):
+             A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents.
          vae ([`AutoencoderKL`]):
              Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
          scheduler ([`DDIMScheduler`]):
@@ -48,7 +48,7 @@ class DiTPipeline(DiffusionPipeline):

      def __init__(
          self,
-         transformer: Transformer2DModel,
+         transformer: DiTTransformer2DModel,
          vae: AutoencoderKL,
          scheduler: KarrasDiffusionSchedulers,
          id2label: Optional[Dict[int, str]] = None,
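
The `DiTPipeline` changes are a pure type swap: the denoiser is now annotated and documented as the dedicated `DiTTransformer2DModel` rather than the general `Transformer2DModel`. Existing checkpoints load as before; a short sketch (the repo id follows the example in the diffusers docs, and `get_label_ids` is the pipeline's own label-name-to-id helper):

import torch
from diffusers import DiTPipeline

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16).to("cuda")
class_ids = pipe.get_label_ids(["white shark"])  # DiT is class-conditional on ImageNet labels
image = pipe(class_labels=class_ids, num_inference_steps=25).images[0]
image.save("shark.png")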
diffusers/pipelines/hunyuandit/__init__.py (new file)
@@ -0,0 +1,48 @@
+ from typing import TYPE_CHECKING
+
+ from ...utils import (
+     DIFFUSERS_SLOW_IMPORT,
+     OptionalDependencyNotAvailable,
+     _LazyModule,
+     get_objects_from_module,
+     is_torch_available,
+     is_transformers_available,
+ )
+
+
+ _dummy_objects = {}
+ _import_structure = {}
+
+
+ try:
+     if not (is_transformers_available() and is_torch_available()):
+         raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+     from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+ else:
+     _import_structure["pipeline_hunyuandit"] = ["HunyuanDiTPipeline"]
+
+ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+     try:
+         if not (is_transformers_available() and is_torch_available()):
+             raise OptionalDependencyNotAvailable()
+
+     except OptionalDependencyNotAvailable:
+         from ...utils.dummy_torch_and_transformers_objects import *
+     else:
+         from .pipeline_hunyuandit import HunyuanDiTPipeline
+
+ else:
+     import sys
+
+     sys.modules[__name__] = _LazyModule(
+         __name__,
+         globals()["__file__"],
+         _import_structure,
+         module_spec=__spec__,
+     )
+
+     for name, value in _dummy_objects.items():
+         setattr(sys.modules[__name__], name, value)
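
This `__init__.py` is the standard diffusers lazy-import boilerplate: on a normal import the module object is replaced with a `_LazyModule` proxy, so `pipeline_hunyuandit` (and its torch/transformers dependencies) is only loaded on first attribute access, while the `TYPE_CHECKING`/`DIFFUSERS_SLOW_IMPORT` branch gives static type checkers real symbols. A small sketch of the observable behavior, assuming torch and transformers are installed and `DIFFUSERS_SLOW_IMPORT` is unset:

import sys
import diffusers.pipelines.hunyuandit  # registers the proxy; the pipeline module is not loaded yet

proxy = sys.modules["diffusers.pipelines.hunyuandit"]
print(type(proxy).__name__)  # "_LazyModule" (the class lives under diffusers.utils)
HunyuanDiTPipeline = proxy.HunyuanDiTPipeline  # first attribute access triggers the real import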