diffusers 0.30.2__py3-none-any.whl → 0.30.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +5 -1
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +109 -6
- diffusers/models/embeddings.py +68 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +35 -34
- diffusers/pipelines/__init__.py +6 -2
- diffusers/pipelines/cogvideo/__init__.py +4 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +2 -17
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +827 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +812 -0
- diffusers/pipelines/cogvideo/pipeline_output.py +20 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0
- {diffusers-0.30.2.dist-info → diffusers-0.30.3.dist-info}/METADATA +1 -1
- {diffusers-0.30.2.dist-info → diffusers-0.30.3.dist-info}/RECORD +17 -14
- {diffusers-0.30.2.dist-info → diffusers-0.30.3.dist-info}/WHEEL +1 -1
- {diffusers-0.30.2.dist-info → diffusers-0.30.3.dist-info}/LICENSE +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.30.3.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.2.dist-info → diffusers-0.30.3.dist-info}/top_level.txt +0 -0
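In user-facing terms, 0.30.3 adds CogVideoX image-to-video and video-to-video support: two new pipelines are exported at the top level, the patch embedding now owns the positional embeddings, and the CogVideoX VAE gains batched, sliced, and tiled encoding. A minimal usage sketch for the new image-to-video pipeline (the `THUDM/CogVideoX-5b-I2V` checkpoint id appears in the diff below; the call arguments are assumed to mirror the existing text-to-video pipeline and may differ in detail):

```python
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# New in 0.30.3: image-to-video pipeline (checkpoint id taken from the diff; adjust as needed).
pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe.vae.enable_tiling()  # exercises the tiled encode path added to the VAE in this release

image = load_image("https://example.com/still.jpg")  # placeholder input image URL
video = pipe(image=image, prompt="a panda strumming a guitar", num_frames=49).frames[0]
export_to_video(video, "output.mp4", fps=8)
```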
diffusers/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.30.2"
+__version__ = "0.30.3"
 
 from typing import TYPE_CHECKING
 
@@ -252,7 +252,9 @@ else:
             "BlipDiffusionControlNetPipeline",
             "BlipDiffusionPipeline",
             "CLIPImageProjection",
+            "CogVideoXImageToVideoPipeline",
             "CogVideoXPipeline",
+            "CogVideoXVideoToVideoPipeline",
             "CycleDiffusionPipeline",
             "FluxPipeline",
             "HunyuanDiTControlNetPipeline",
@@ -691,7 +693,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             AudioLDMPipeline,
             AuraFlowPipeline,
             CLIPImageProjection,
+            CogVideoXImageToVideoPipeline,
             CogVideoXPipeline,
+            CogVideoXVideoToVideoPipeline,
             CycleDiffusionPipeline,
             FluxPipeline,
             HunyuanDiTControlNetPipeline,
diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
CHANGED
@@ -999,6 +999,7 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         # setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different
         # number of temporal frames.
         self.num_latent_frames_batch_size = 2
+        self.num_sample_frames_batch_size = 8
 
         # We make the minimum height and width of sample for tiling half that of the generally supported
         self.tile_sample_min_height = sample_height // 2
@@ -1081,6 +1082,31 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         """
         self.use_slicing = False
 
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = x.shape
+
+        if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
+            return self.tiled_encode(x)
+
+        frame_batch_size = self.num_sample_frames_batch_size
+        # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
+        num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+        enc = []
+        for i in range(num_batches):
+            remaining_frames = num_frames % frame_batch_size
+            start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
+            end_frame = frame_batch_size * (i + 1) + remaining_frames
+            x_intermediate = x[:, :, start_frame:end_frame]
+            x_intermediate = self.encoder(x_intermediate)
+            if self.quant_conv is not None:
+                x_intermediate = self.quant_conv(x_intermediate)
+            enc.append(x_intermediate)
+
+        self._clear_fake_context_parallel_cache()
+        enc = torch.cat(enc, dim=2)
+
+        return enc
+
     @apply_forward_hook
     def encode(
         self, x: torch.Tensor, return_dict: bool = True
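The remainder handling in `_encode` above is easiest to see with concrete numbers: any leftover frames are folded into the first batch, so every frame is encoded exactly once. A standalone sketch of the same index arithmetic, using 49 frames and the `num_sample_frames_batch_size = 8` default added in this file:

```python
# Standalone illustration of the frame-batching indices used by `_encode` above.
# 49 = 8 * 6 + 1, so the first batch absorbs the extra frame (9 frames); the rest get 8.
num_frames, frame_batch_size = 49, 8

num_batches = num_frames // frame_batch_size if num_frames > 1 else 1  # 6
remaining_frames = num_frames % frame_batch_size                       # 1

spans = []
for i in range(num_batches):
    start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
    end_frame = frame_batch_size * (i + 1) + remaining_frames
    spans.append((start_frame, end_frame))

print(spans)  # [(0, 9), (9, 17), (17, 25), (25, 33), (33, 41), (41, 49)]
assert spans[-1][1] == num_frames  # all 49 frames are covered with no overlap
```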
@@ -1094,13 +1120,17 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
                 Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
 
         Returns:
-                The latent representations of the encoded images. If `return_dict` is True, a
+                The latent representations of the encoded videos. If `return_dict` is True, a
                 [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
         """
-        h = self.encoder(x)
-        if self.quant_conv is not None:
-            h = self.quant_conv(h)
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self._encode(x)
+
         posterior = DiagonalGaussianDistribution(h)
+
         if not return_dict:
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)
@@ -1112,8 +1142,9 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             return self.tiled_decode(z, return_dict=return_dict)
 
         frame_batch_size = self.num_latent_frames_batch_size
+        num_batches = num_frames // frame_batch_size
         dec = []
-        for i in range(num_frames // frame_batch_size):
+        for i in range(num_batches):
             remaining_frames = num_frames % frame_batch_size
             start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
             end_frame = frame_batch_size * (i + 1) + remaining_frames
@@ -1172,6 +1203,77 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         )
         return b
 
+    def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.Tensor`): Input batch of videos.
+
+        Returns:
+            `torch.Tensor`:
+                The latent representation of the encoded videos.
+        """
+        # For a rough memory estimate, take a look at the `tiled_decode` method.
+        batch_size, num_channels, num_frames, height, width = x.shape
+
+        overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height))
+        overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width))
+        blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height)
+        blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width)
+        row_limit_height = self.tile_latent_min_height - blend_extent_height
+        row_limit_width = self.tile_latent_min_width - blend_extent_width
+        frame_batch_size = self.num_sample_frames_batch_size
+
+        # Split x into overlapping tiles and encode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, height, overlap_height):
+            row = []
+            for j in range(0, width, overlap_width):
+                # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
+                num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+                time = []
+                for k in range(num_batches):
+                    remaining_frames = num_frames % frame_batch_size
+                    start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
+                    end_frame = frame_batch_size * (k + 1) + remaining_frames
+                    tile = x[
+                        :,
+                        :,
+                        start_frame:end_frame,
+                        i : i + self.tile_sample_min_height,
+                        j : j + self.tile_sample_min_width,
+                    ]
+                    tile = self.encoder(tile)
+                    if self.quant_conv is not None:
+                        tile = self.quant_conv(tile)
+                    time.append(tile)
+                self._clear_fake_context_parallel_cache()
+                row.append(torch.cat(time, dim=2))
+            rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent_width)
+                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
+            result_rows.append(torch.cat(result_row, dim=4))
+
+        enc = torch.cat(result_rows, dim=3)
+        return enc
+
     def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
         r"""
         Decode a batch of images using a tiled decoder.
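To make the spatial tiling loop above concrete, here is a standalone sketch of how the overlapping tile grid is laid out. The tile size and overlap factor below are illustrative placeholders, not values read from the model config:

```python
# Illustration of the spatial tiling pattern in `tiled_encode` above.
# Values are hypothetical; the real tile size and overlap factor come from the VAE configuration.
height, width = 480, 720
tile_min_height, tile_min_width = 240, 360  # assumed tile size (half the sample size, as in this file)
overlap_factor = 1 / 6                      # assumed overlap factor

overlap_height = int(tile_min_height * (1 - overlap_factor))  # stride between tile origins: 200
overlap_width = int(tile_min_width * (1 - overlap_factor))    # stride between tile origins: 300

tile_origins = [
    (i, j)
    for i in range(0, height, overlap_height)
    for j in range(0, width, overlap_width)
]
print(tile_origins)
# [(0, 0), (0, 300), (0, 600), (200, 0), ..., (400, 600)] -> a 3 x 3 grid of overlapping tiles
```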
@@ -1212,8 +1314,9 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         for i in range(0, height, overlap_height):
             row = []
             for j in range(0, width, overlap_width):
+                num_batches = num_frames // frame_batch_size
                 time = []
-                for k in range(num_frames // frame_batch_size):
+                for k in range(num_batches):
                     remaining_frames = num_frames % frame_batch_size
                     start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
                     end_frame = frame_batch_size * (k + 1) + remaining_frames
diffusers/models/embeddings.py
CHANGED
@@ -342,15 +342,61 @@ class CogVideoXPatchEmbed(nn.Module):
         embed_dim: int = 1920,
         text_embed_dim: int = 4096,
         bias: bool = True,
+        sample_width: int = 90,
+        sample_height: int = 60,
+        sample_frames: int = 49,
+        temporal_compression_ratio: int = 4,
+        max_text_seq_length: int = 226,
+        spatial_interpolation_scale: float = 1.875,
+        temporal_interpolation_scale: float = 1.0,
+        use_positional_embeddings: bool = True,
+        use_learned_positional_embeddings: bool = True,
     ) -> None:
         super().__init__()
+
         self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.sample_height = sample_height
+        self.sample_width = sample_width
+        self.sample_frames = sample_frames
+        self.temporal_compression_ratio = temporal_compression_ratio
+        self.max_text_seq_length = max_text_seq_length
+        self.spatial_interpolation_scale = spatial_interpolation_scale
+        self.temporal_interpolation_scale = temporal_interpolation_scale
+        self.use_positional_embeddings = use_positional_embeddings
+        self.use_learned_positional_embeddings = use_learned_positional_embeddings
 
         self.proj = nn.Conv2d(
             in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
         )
         self.text_proj = nn.Linear(text_embed_dim, embed_dim)
 
+        if use_positional_embeddings or use_learned_positional_embeddings:
+            persistent = use_learned_positional_embeddings
+            pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
+            self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
+
+    def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
+        post_patch_height = sample_height // self.patch_size
+        post_patch_width = sample_width // self.patch_size
+        post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
+        num_patches = post_patch_height * post_patch_width * post_time_compression_frames
+
+        pos_embedding = get_3d_sincos_pos_embed(
+            self.embed_dim,
+            (post_patch_width, post_patch_height),
+            post_time_compression_frames,
+            self.spatial_interpolation_scale,
+            self.temporal_interpolation_scale,
+        )
+        pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
+        joint_pos_embedding = torch.zeros(
+            1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
+        )
+        joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)
+
+        return joint_pos_embedding
+
     def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         r"""
         Args:
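With the constructor defaults shown above (`sample_height=60`, `sample_width=90`, `sample_frames=49`, `temporal_compression_ratio=4`, `max_text_seq_length=226`) and a patch size of 2 (an assumption; the patch size default is not shown in this hunk), the arithmetic in `_get_positional_embeddings` produces a joint buffer of shape `(1, 17776, embed_dim)`:

```python
# Sequence-length arithmetic behind `_get_positional_embeddings`, using the defaults above.
# patch_size = 2 is an assumed value; the other numbers are constructor defaults from the diff.
patch_size = 2
sample_height, sample_width, sample_frames = 60, 90, 49
temporal_compression_ratio, max_text_seq_length = 4, 226

post_patch_height = sample_height // patch_size                                        # 30
post_patch_width = sample_width // patch_size                                          # 45
post_time_compression_frames = (sample_frames - 1) // temporal_compression_ratio + 1   # 13
num_patches = post_patch_height * post_patch_width * post_time_compression_frames      # 17550

print(num_patches, max_text_seq_length + num_patches)  # 17550 17776
```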
@@ -371,6 +417,28 @@ class CogVideoXPatchEmbed(nn.Module):
         embeds = torch.cat(
             [text_embeds, image_embeds], dim=1
         ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]
+
+        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
+            if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height):
+                raise ValueError(
+                    "It is currently not possible to generate videos at a different resolution that the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'."
+                    "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues."
+                )
+
+            pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+            if (
+                self.sample_height != height
+                or self.sample_width != width
+                or self.sample_frames != pre_time_compression_frames
+            ):
+                pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames)
+                pos_embedding = pos_embedding.to(embeds.device, dtype=embeds.dtype)
+            else:
+                pos_embedding = self.pos_embedding
+
+            embeds = embeds + pos_embedding
+
         return embeds
 
 
diffusers/models/transformers/cogvideox_transformer_3d.py
CHANGED
@@ -23,7 +23,7 @@ from ...utils import is_torch_version, logging
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..attention import Attention, FeedForward
 from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
-from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
+from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
 from ..normalization import AdaLayerNorm, CogVideoXLayerNormZero
@@ -235,37 +235,42 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
         spatial_interpolation_scale: float = 1.875,
         temporal_interpolation_scale: float = 1.0,
         use_rotary_positional_embeddings: bool = False,
+        use_learned_positional_embeddings: bool = False,
     ):
         super().__init__()
         inner_dim = num_attention_heads * attention_head_dim
 
-        post_patch_height = sample_height // patch_size
-        post_patch_width = sample_width // patch_size
-        post_time_compression_frames = (sample_frames - 1) // temporal_compression_ratio + 1
-        self.num_patches = post_patch_height * post_patch_width * post_time_compression_frames
+        if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
+            raise ValueError(
+                "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
+                "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
+                "issue at https://github.com/huggingface/diffusers/issues."
+            )
 
         # 1. Patch embedding
-        self.patch_embed = CogVideoXPatchEmbed(patch_size, in_channels, inner_dim, text_embed_dim, bias=True)
-        self.embedding_dropout = nn.Dropout(dropout)
-
-        # 2. 3D positional embeddings
-        spatial_pos_embedding = get_3d_sincos_pos_embed(
-            inner_dim,
-            (post_patch_width, post_patch_height),
-            post_time_compression_frames,
-            spatial_interpolation_scale,
-            temporal_interpolation_scale,
+        self.patch_embed = CogVideoXPatchEmbed(
+            patch_size=patch_size,
+            in_channels=in_channels,
+            embed_dim=inner_dim,
+            text_embed_dim=text_embed_dim,
+            bias=True,
+            sample_width=sample_width,
+            sample_height=sample_height,
+            sample_frames=sample_frames,
+            temporal_compression_ratio=temporal_compression_ratio,
+            max_text_seq_length=max_text_seq_length,
+            spatial_interpolation_scale=spatial_interpolation_scale,
+            temporal_interpolation_scale=temporal_interpolation_scale,
+            use_positional_embeddings=not use_rotary_positional_embeddings,
+            use_learned_positional_embeddings=use_learned_positional_embeddings,
         )
-        spatial_pos_embedding = torch.from_numpy(spatial_pos_embedding).flatten(0, 1)
-        pos_embedding = torch.zeros(1, max_text_seq_length + self.num_patches, inner_dim, requires_grad=False)
-        pos_embedding.data[:, max_text_seq_length:].copy_(spatial_pos_embedding)
-        self.register_buffer("pos_embedding", pos_embedding, persistent=False)
+        self.embedding_dropout = nn.Dropout(dropout)
 
-        # 3. Time embeddings
+        # 2. Time embeddings
         self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
         self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
 
-        # 4. Define spatio-temporal transformers blocks
+        # 3. Define spatio-temporal transformers blocks
         self.transformer_blocks = nn.ModuleList(
             [
                 CogVideoXBlock(
@@ -284,7 +289,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
         )
         self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)
 
-        # 5. Output blocks
+        # 4. Output blocks
         self.norm_out = AdaLayerNorm(
             embedding_dim=time_embed_dim,
             output_dim=2 * inner_dim,
@@ -422,20 +427,13 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
 
         # 2. Patch embedding
         hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
+        hidden_states = self.embedding_dropout(hidden_states)
 
-        # 3. Position embedding
         text_seq_length = encoder_hidden_states.shape[1]
-        if not self.config.use_rotary_positional_embeddings:
-            seq_length = height * width * num_frames // (self.config.patch_size**2)
-
-            pos_embeds = self.pos_embedding[:, : text_seq_length + seq_length]
-            hidden_states = hidden_states + pos_embeds
-            hidden_states = self.embedding_dropout(hidden_states)
-
         encoder_hidden_states = hidden_states[:, :text_seq_length]
         hidden_states = hidden_states[:, text_seq_length:]
 
-        # 4. Transformer blocks
+        # 3. Transformer blocks
         for i, block in enumerate(self.transformer_blocks):
             if self.training and self.gradient_checkpointing:
 
@@ -471,13 +469,16 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
         hidden_states = self.norm_final(hidden_states)
         hidden_states = hidden_states[:, text_seq_length:]
 
-        # 5. Final block
+        # 4. Final block
         hidden_states = self.norm_out(hidden_states, temb=emb)
         hidden_states = self.proj_out(hidden_states)
 
-        # 6. Unpatchify
+        # 5. Unpatchify
+        # Note: we use `-1` instead of `channels`:
+        # - It is okay to `channels` use for CogVideoX-2b and CogVideoX-5b (number of input channels is equal to output channels)
+        # - However, for CogVideoX-5b-I2V also takes concatenated input image latents (number of input channels is twice the output channels)
         p = self.config.patch_size
-        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, channels, p, p)
+        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
         output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
 
         if not return_dict:
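The unpatchify step above can be sanity-checked with a small standalone shape walk-through. The sizes below are illustrative (13 latent frames, a 60 x 90 latent grid, and patch size 2 mirror typical CogVideoX shapes; the channel count is arbitrary):

```python
import torch

# Shape walk-through of the unpatchify reshape/permute used above (illustrative sizes only).
batch_size, num_frames, out_channels, p = 1, 13, 16, 2
height, width = 60, 90                                     # latent height/width before patchification

seq = num_frames * (height // p) * (width // p)            # 13 * 30 * 45 = 17550 tokens
hidden_states = torch.randn(batch_size, seq, out_channels * p * p)

output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)

print(output.shape)  # torch.Size([1, 13, 16, 60, 90]) -> (batch, frames, channels, height, width)
```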
diffusers/pipelines/__init__.py
CHANGED
@@ -132,7 +132,11 @@ else:
         "AudioLDM2UNet2DConditionModel",
     ]
     _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"]
-    _import_structure["cogvideo"] = ["CogVideoXPipeline"]
+    _import_structure["cogvideo"] = [
+        "CogVideoXPipeline",
+        "CogVideoXImageToVideoPipeline",
+        "CogVideoXVideoToVideoPipeline",
+    ]
     _import_structure["controlnet"].extend(
         [
             "BlipDiffusionControlNetPipeline",
@@ -452,7 +456,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         )
         from .aura_flow import AuraFlowPipeline
         from .blip_diffusion import BlipDiffusionPipeline
-        from .cogvideo import CogVideoXPipeline
+        from .cogvideo import CogVideoXImageToVideoPipeline, CogVideoXPipeline, CogVideoXVideoToVideoPipeline
         from .controlnet import (
             BlipDiffusionControlNetPipeline,
             StableDiffusionControlNetImg2ImgPipeline,
diffusers/pipelines/cogvideo/__init__.py
CHANGED
@@ -23,6 +23,8 @@ except OptionalDependencyNotAvailable:
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+    _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
+    _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -33,6 +35,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_cogvideox import CogVideoXPipeline
+        from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
+        from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
 else:
     import sys
diffusers/pipelines/cogvideo/pipeline_cogvideox.py
CHANGED
@@ -15,7 +15,6 @@
 
 import inspect
 import math
-from dataclasses import dataclass
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -26,9 +25,10 @@ from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import BaseOutput, logging, replace_example_docstring
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
+from .pipeline_output import CogVideoXPipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -136,21 +136,6 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-@dataclass
-class CogVideoXPipelineOutput(BaseOutput):
-    r"""
-    Output class for CogVideo pipelines.
-
-    Args:
-        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-            `(batch_size, num_frames, channels, height, width)`.
-    """
-
-    frames: torch.Tensor
-
-
 class CogVideoXPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-video generation using CogVideoX.