diffusers 0.30.1__py3-none-any.whl → 0.30.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffusers/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.30.1"
+ __version__ = "0.30.3"
 
  from typing import TYPE_CHECKING
 
@@ -252,7 +252,9 @@ else:
  "BlipDiffusionControlNetPipeline",
  "BlipDiffusionPipeline",
  "CLIPImageProjection",
+ "CogVideoXImageToVideoPipeline",
  "CogVideoXPipeline",
+ "CogVideoXVideoToVideoPipeline",
  "CycleDiffusionPipeline",
  "FluxPipeline",
  "HunyuanDiTControlNetPipeline",
@@ -691,7 +693,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  AudioLDMPipeline,
  AuraFlowPipeline,
  CLIPImageProjection,
+ CogVideoXImageToVideoPipeline,
  CogVideoXPipeline,
+ CogVideoXVideoToVideoPipeline,
  CycleDiffusionPipeline,
  FluxPipeline,
  HunyuanDiTControlNetPipeline,
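The two `diffusers/__init__.py` hunks above register the new CogVideoX image-to-video and video-to-video pipelines in both the lazy-import table and the `TYPE_CHECKING` branch, so they become top-level exports. A minimal sketch of what that enables (nothing beyond the import itself is specified by the diff):

```python
# After upgrading to 0.30.3, the new pipelines are importable from the package root.
from diffusers import (
    CogVideoXImageToVideoPipeline,  # added between 0.30.1 and 0.30.3
    CogVideoXPipeline,
    CogVideoXVideoToVideoPipeline,  # added between 0.30.1 and 0.30.3
)

print(CogVideoXImageToVideoPipeline.__name__, CogVideoXVideoToVideoPipeline.__name__)
```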
@@ -208,6 +208,8 @@ class IPAdapterMixin:
  pretrained_model_name_or_path_or_dict,
  subfolder=image_encoder_subfolder,
  low_cpu_mem_usage=low_cpu_mem_usage,
+ cache_dir=cache_dir,
+ local_files_only=local_files_only,
  ).to(self.device, dtype=self.dtype)
  self.register_modules(image_encoder=image_encoder)
  else:
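This hunk, from the `IPAdapterMixin` loader, forwards `cache_dir` and `local_files_only` to the image-encoder load so that offline or custom-cache setups no longer fail partway through `load_ip_adapter`. A hedged sketch of the call this affects; the base model and IP-Adapter repo/weight names are common examples, not something this diff prescribes:

```python
import torch
from diffusers import StableDiffusionPipeline

# Assumed checkpoint names, for illustration only.
pipe = StableDiffusionPipeline.from_pretrained(
    "Lykon/dreamshaper-8", torch_dtype=torch.float16, local_files_only=True
)

# With this fix, cache_dir/local_files_only also reach the CLIP image-encoder load,
# so a pre-populated cache is enough and no network access is attempted.
pipe.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="models",
    weight_name="ip-adapter_sd15.bin",
    cache_dir="/models/hf-cache",
    local_files_only=True,
)
```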
@@ -91,11 +91,11 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
  "xl_inpaint": {"pretrained_model_name_or_path": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"},
  "playground-v2-5": {"pretrained_model_name_or_path": "playgroundai/playground-v2.5-1024px-aesthetic"},
  "upscale": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-x4-upscaler"},
- "inpainting": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-inpainting"},
+ "inpainting": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8-inpainting"},
  "inpainting_v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-inpainting"},
  "controlnet": {"pretrained_model_name_or_path": "lllyasviel/control_v11p_sd15_canny"},
  "v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1"},
- "v1": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-v1-5"},
+ "v1": {"pretrained_model_name_or_path": "Lykon/dreamshaper-8"},
  "stable_cascade_stage_b": {"pretrained_model_name_or_path": "stabilityai/stable-cascade", "subfolder": "decoder"},
  "stable_cascade_stage_b_lite": {
      "pretrained_model_name_or_path": "stabilityai/stable-cascade",
@@ -999,6 +999,7 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
  # setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different
  # number of temporal frames.
  self.num_latent_frames_batch_size = 2
+ self.num_sample_frames_batch_size = 8
 
  # We make the minimum height and width of sample for tiling half that of the generally supported
  self.tile_sample_min_height = sample_height // 2
@@ -1081,6 +1082,31 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
      """
      self.use_slicing = False
 
+ def _encode(self, x: torch.Tensor) -> torch.Tensor:
+     batch_size, num_channels, num_frames, height, width = x.shape
+
+     if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
+         return self.tiled_encode(x)
+
+     frame_batch_size = self.num_sample_frames_batch_size
+     # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
+     num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+     enc = []
+     for i in range(num_batches):
+         remaining_frames = num_frames % frame_batch_size
+         start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
+         end_frame = frame_batch_size * (i + 1) + remaining_frames
+         x_intermediate = x[:, :, start_frame:end_frame]
+         x_intermediate = self.encoder(x_intermediate)
+         if self.quant_conv is not None:
+             x_intermediate = self.quant_conv(x_intermediate)
+         enc.append(x_intermediate)
+
+     self._clear_fake_context_parallel_cache()
+     enc = torch.cat(enc, dim=2)
+
+     return enc
+
  @apply_forward_hook
  def encode(
      self, x: torch.Tensor, return_dict: bool = True
@@ -1094,13 +1120,17 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
          Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
 
      Returns:
-         The latent representations of the encoded images. If `return_dict` is True, a
+         The latent representations of the encoded videos. If `return_dict` is True, a
          [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
      """
-     h = self.encoder(x)
-     if self.quant_conv is not None:
-         h = self.quant_conv(h)
+     if self.use_slicing and x.shape[0] > 1:
+         encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+         h = torch.cat(encoded_slices)
+     else:
+         h = self._encode(x)
+
      posterior = DiagonalGaussianDistribution(h)
+
      if not return_dict:
          return (posterior,)
      return AutoencoderKLOutput(latent_dist=posterior)
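With the new `_encode` helper, `encode` batches the input frames through the encoder in chunks of `num_sample_frames_batch_size` and, when slicing is enabled, encodes one sample of the batch at a time, mirroring what `decode` already did. A rough usage sketch (checkpoint name, subfolder and shapes are assumptions; the layout is `[batch, channels, frames, height, width]` as in the code above):

```python
import torch
from diffusers import AutoencoderKLCogVideoX

# Assumed checkpoint, for illustration.
vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16
).to("cuda")
vae.enable_slicing()  # encode the batch one sample at a time

# 2 videos, 3 channels, 49 frames (8 * 6 + 1), 480x720.
video = torch.randn(2, 3, 49, 480, 720, dtype=torch.float16, device="cuda")
with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()
```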
@@ -1112,8 +1142,9 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
      return self.tiled_decode(z, return_dict=return_dict)
 
  frame_batch_size = self.num_latent_frames_batch_size
+ num_batches = num_frames // frame_batch_size
  dec = []
- for i in range(num_frames // frame_batch_size):
+ for i in range(num_batches):
      remaining_frames = num_frames % frame_batch_size
      start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
      end_frame = frame_batch_size * (i + 1) + remaining_frames
@@ -1172,6 +1203,77 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
      )
      return b
 
+ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+     r"""Encode a batch of images using a tiled encoder.
+
+     When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+     steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+     different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+     tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+     output, but they should be much less noticeable.
+
+     Args:
+         x (`torch.Tensor`): Input batch of videos.
+
+     Returns:
+         `torch.Tensor`:
+             The latent representation of the encoded videos.
+     """
+     # For a rough memory estimate, take a look at the `tiled_decode` method.
+     batch_size, num_channels, num_frames, height, width = x.shape
+
+     overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height))
+     overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width))
+     blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height)
+     blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width)
+     row_limit_height = self.tile_latent_min_height - blend_extent_height
+     row_limit_width = self.tile_latent_min_width - blend_extent_width
+     frame_batch_size = self.num_sample_frames_batch_size
+
+     # Split x into overlapping tiles and encode them separately.
+     # The tiles have an overlap to avoid seams between tiles.
+     rows = []
+     for i in range(0, height, overlap_height):
+         row = []
+         for j in range(0, width, overlap_width):
+             # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
+             num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
+             time = []
+             for k in range(num_batches):
+                 remaining_frames = num_frames % frame_batch_size
+                 start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
+                 end_frame = frame_batch_size * (k + 1) + remaining_frames
+                 tile = x[
+                     :,
+                     :,
+                     start_frame:end_frame,
+                     i : i + self.tile_sample_min_height,
+                     j : j + self.tile_sample_min_width,
+                 ]
+                 tile = self.encoder(tile)
+                 if self.quant_conv is not None:
+                     tile = self.quant_conv(tile)
+                 time.append(tile)
+             self._clear_fake_context_parallel_cache()
+             row.append(torch.cat(time, dim=2))
+         rows.append(row)
+
+     result_rows = []
+     for i, row in enumerate(rows):
+         result_row = []
+         for j, tile in enumerate(row):
+             # blend the above tile and the left tile
+             # to the current tile and add the current tile to the result row
+             if i > 0:
+                 tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
+             if j > 0:
+                 tile = self.blend_h(row[j - 1], tile, blend_extent_width)
+             result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
+         result_rows.append(torch.cat(result_row, dim=4))
+
+     enc = torch.cat(result_rows, dim=3)
+     return enc
+
  def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
      r"""
      Decode a batch of images using a tiled decoder.
@@ -1212,8 +1314,9 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
  for i in range(0, height, overlap_height):
      row = []
      for j in range(0, width, overlap_width):
+         num_batches = num_frames // frame_batch_size
          time = []
-         for k in range(num_frames // frame_batch_size):
+         for k in range(num_batches):
              remaining_frames = num_frames % frame_batch_size
              start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
              end_frame = frame_batch_size * (k + 1) + remaining_frames
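`tiled_encode` is the encoding counterpart of the existing `tiled_decode`: overlapping spatial tiles are encoded independently (frames chunked by `num_sample_frames_batch_size`), then blended with `blend_v`/`blend_h`. A hedged sketch of enabling it; tiling only kicks in once the input exceeds `tile_sample_min_height`/`tile_sample_min_width`, and the checkpoint name is again an assumption:

```python
import torch
from diffusers import AutoencoderKLCogVideoX

vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16
).to("cuda")
vae.enable_tiling()  # sets use_tiling, so _encode() routes through tiled_encode()

# One large video: 3 channels, 49 frames, 720x1280 pixels.
video = torch.randn(1, 3, 49, 720, 1280, dtype=torch.float16, device="cuda")
with torch.no_grad():
    latents = vae.encode(video).latent_dist.mode()  # deterministic latents
```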
@@ -342,15 +342,61 @@ class CogVideoXPatchEmbed(nn.Module):
      embed_dim: int = 1920,
      text_embed_dim: int = 4096,
      bias: bool = True,
+     sample_width: int = 90,
+     sample_height: int = 60,
+     sample_frames: int = 49,
+     temporal_compression_ratio: int = 4,
+     max_text_seq_length: int = 226,
+     spatial_interpolation_scale: float = 1.875,
+     temporal_interpolation_scale: float = 1.0,
+     use_positional_embeddings: bool = True,
+     use_learned_positional_embeddings: bool = True,
  ) -> None:
      super().__init__()
+
      self.patch_size = patch_size
+     self.embed_dim = embed_dim
+     self.sample_height = sample_height
+     self.sample_width = sample_width
+     self.sample_frames = sample_frames
+     self.temporal_compression_ratio = temporal_compression_ratio
+     self.max_text_seq_length = max_text_seq_length
+     self.spatial_interpolation_scale = spatial_interpolation_scale
+     self.temporal_interpolation_scale = temporal_interpolation_scale
+     self.use_positional_embeddings = use_positional_embeddings
+     self.use_learned_positional_embeddings = use_learned_positional_embeddings
 
      self.proj = nn.Conv2d(
          in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
      )
      self.text_proj = nn.Linear(text_embed_dim, embed_dim)
 
+     if use_positional_embeddings or use_learned_positional_embeddings:
+         persistent = use_learned_positional_embeddings
+         pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
+         self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
+
+ def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
+     post_patch_height = sample_height // self.patch_size
+     post_patch_width = sample_width // self.patch_size
+     post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
+     num_patches = post_patch_height * post_patch_width * post_time_compression_frames
+
+     pos_embedding = get_3d_sincos_pos_embed(
+         self.embed_dim,
+         (post_patch_width, post_patch_height),
+         post_time_compression_frames,
+         self.spatial_interpolation_scale,
+         self.temporal_interpolation_scale,
+     )
+     pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
+     joint_pos_embedding = torch.zeros(
+         1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
+     )
+     joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)
+
+     return joint_pos_embedding
+
  def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
      r"""
      Args:
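The hunk above moves positional-embedding construction into `CogVideoXPatchEmbed` itself: `_get_positional_embeddings` builds a joint text-plus-video sincos table sized from the sample geometry and registers it as a buffer (persistent only when the embeddings are learned). The next hunk then adds it to the patch embeddings in `forward`, regenerating the table on the fly when the requested resolution or frame count differs from the defaults. A small worked check of the sizing arithmetic, using the defaults shown above and assuming `patch_size=2`:

```python
# Sizes produced by _get_positional_embeddings with the defaults above:
# sample_width=90, sample_height=60, sample_frames=49,
# temporal_compression_ratio=4, max_text_seq_length=226, patch_size=2 (assumed).
patch_size = 2
post_patch_height = 60 // patch_size              # 30
post_patch_width = 90 // patch_size               # 45
post_time_compression_frames = (49 - 1) // 4 + 1  # 13

num_patches = post_patch_height * post_patch_width * post_time_compression_frames  # 17550
joint_length = 226 + num_patches                  # 17776 rows in the joint table
print(num_patches, joint_length)
```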
@@ -371,6 +417,28 @@ class CogVideoXPatchEmbed(nn.Module):
  embeds = torch.cat(
      [text_embeds, image_embeds], dim=1
  ).contiguous() # [batch, seq_length + num_frames x height x width, channels]
+
+ if self.use_positional_embeddings or self.use_learned_positional_embeddings:
+     if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height):
+         raise ValueError(
+             "It is currently not possible to generate videos at a different resolution that the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'."
+             "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues."
+         )
+
+     pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+     if (
+         self.sample_height != height
+         or self.sample_width != width
+         or self.sample_frames != pre_time_compression_frames
+     ):
+         pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames)
+         pos_embedding = pos_embedding.to(embeds.device, dtype=embeds.dtype)
+     else:
+         pos_embedding = self.pos_embedding
+
+     embeds = embeds + pos_embedding
+
  return embeds
 
 
@@ -23,7 +23,7 @@ from ...utils import is_torch_version, logging
  from ...utils.torch_utils import maybe_allow_in_graph
  from ..attention import Attention, FeedForward
  from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
- from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
+ from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
  from ..modeling_outputs import Transformer2DModelOutput
  from ..modeling_utils import ModelMixin
  from ..normalization import AdaLayerNorm, CogVideoXLayerNormZero
@@ -235,37 +235,42 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
      spatial_interpolation_scale: float = 1.875,
      temporal_interpolation_scale: float = 1.0,
      use_rotary_positional_embeddings: bool = False,
+     use_learned_positional_embeddings: bool = False,
  ):
      super().__init__()
      inner_dim = num_attention_heads * attention_head_dim
 
-     post_patch_height = sample_height // patch_size
-     post_patch_width = sample_width // patch_size
-     post_time_compression_frames = (sample_frames - 1) // temporal_compression_ratio + 1
-     self.num_patches = post_patch_height * post_patch_width * post_time_compression_frames
+     if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
+         raise ValueError(
+             "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
+             "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
+             "issue at https://github.com/huggingface/diffusers/issues."
+         )
 
      # 1. Patch embedding
-     self.patch_embed = CogVideoXPatchEmbed(patch_size, in_channels, inner_dim, text_embed_dim, bias=True)
-     self.embedding_dropout = nn.Dropout(dropout)
-
-     # 2. 3D positional embeddings
-     spatial_pos_embedding = get_3d_sincos_pos_embed(
-         inner_dim,
-         (post_patch_width, post_patch_height),
-         post_time_compression_frames,
-         spatial_interpolation_scale,
-         temporal_interpolation_scale,
+     self.patch_embed = CogVideoXPatchEmbed(
+         patch_size=patch_size,
+         in_channels=in_channels,
+         embed_dim=inner_dim,
+         text_embed_dim=text_embed_dim,
+         bias=True,
+         sample_width=sample_width,
+         sample_height=sample_height,
+         sample_frames=sample_frames,
+         temporal_compression_ratio=temporal_compression_ratio,
+         max_text_seq_length=max_text_seq_length,
+         spatial_interpolation_scale=spatial_interpolation_scale,
+         temporal_interpolation_scale=temporal_interpolation_scale,
+         use_positional_embeddings=not use_rotary_positional_embeddings,
+         use_learned_positional_embeddings=use_learned_positional_embeddings,
      )
-     spatial_pos_embedding = torch.from_numpy(spatial_pos_embedding).flatten(0, 1)
-     pos_embedding = torch.zeros(1, max_text_seq_length + self.num_patches, inner_dim, requires_grad=False)
-     pos_embedding.data[:, max_text_seq_length:].copy_(spatial_pos_embedding)
-     self.register_buffer("pos_embedding", pos_embedding, persistent=False)
+     self.embedding_dropout = nn.Dropout(dropout)
 
-     # 3. Time embeddings
+     # 2. Time embeddings
      self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
      self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
 
-     # 4. Define spatio-temporal transformers blocks
+     # 3. Define spatio-temporal transformers blocks
      self.transformer_blocks = nn.ModuleList(
          [
              CogVideoXBlock(
@@ -284,7 +289,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
      )
      self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)
 
-     # 5. Output blocks
+     # 4. Output blocks
      self.norm_out = AdaLayerNorm(
          embedding_dim=time_embed_dim,
          output_dim=2 * inner_dim,
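On the transformer side, `CogVideoXPatchEmbed` now receives the full sample geometry and owns the positional embeddings, and the new `use_learned_positional_embeddings` flag is validated up front: it is only accepted together with rotary embeddings, matching the released checkpoints. A quick hedged sketch of that check (all other constructor arguments are left at their defaults; the error is raised before any large module is built):

```python
from diffusers import CogVideoXTransformer3DModel

# Invalid combination per the new check in __init__: learned positional embeddings
# without rotary embeddings is rejected immediately.
try:
    CogVideoXTransformer3DModel(
        use_rotary_positional_embeddings=False,
        use_learned_positional_embeddings=True,
    )
except ValueError as err:
    print("rejected:", err)
```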
@@ -422,20 +427,13 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
 
  # 2. Patch embedding
  hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
+ hidden_states = self.embedding_dropout(hidden_states)
 
- # 3. Position embedding
  text_seq_length = encoder_hidden_states.shape[1]
- if not self.config.use_rotary_positional_embeddings:
-     seq_length = height * width * num_frames // (self.config.patch_size**2)
-
-     pos_embeds = self.pos_embedding[:, : text_seq_length + seq_length]
-     hidden_states = hidden_states + pos_embeds
-     hidden_states = self.embedding_dropout(hidden_states)
-
  encoder_hidden_states = hidden_states[:, :text_seq_length]
  hidden_states = hidden_states[:, text_seq_length:]
 
- # 4. Transformer blocks
+ # 3. Transformer blocks
  for i, block in enumerate(self.transformer_blocks):
      if self.training and self.gradient_checkpointing:
@@ -471,13 +469,16 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
  hidden_states = self.norm_final(hidden_states)
  hidden_states = hidden_states[:, text_seq_length:]
 
- # 5. Final block
+ # 4. Final block
  hidden_states = self.norm_out(hidden_states, temb=emb)
  hidden_states = self.proj_out(hidden_states)
 
- # 6. Unpatchify
+ # 5. Unpatchify
+ # Note: we use `-1` instead of `channels`:
+ # - It is okay to `channels` use for CogVideoX-2b and CogVideoX-5b (number of input channels is equal to output channels)
+ # - However, for CogVideoX-5b-I2V also takes concatenated input image latents (number of input channels is twice the output channels)
  p = self.config.patch_size
- output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, channels, p, p)
+ output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
  output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
 
  if not return_dict:
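The unpatchify step now infers the channel dimension with `-1`, because CogVideoX-5b-I2V feeds the transformer twice as many input channels (video latents concatenated with image latents) as it emits, so the input `channels` value is no longer the right count. A toy shape check of the reshape/permute with made-up small dimensions:

```python
import torch

# Toy sizes: batch 1, 2 latent frames, an 8x12 latent grid, patch size 2, 16 output channels.
batch_size, num_frames, height, width, p, out_channels = 1, 2, 8, 12, 2, 16
hidden_states = torch.randn(
    batch_size, num_frames * (height // p) * (width // p), out_channels * p * p
)

output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
print(output.shape)  # torch.Size([1, 2, 16, 8, 12])
```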
@@ -132,7 +132,11 @@ else:
  "AudioLDM2UNet2DConditionModel",
  ]
  _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"]
- _import_structure["cogvideo"] = ["CogVideoXPipeline"]
+ _import_structure["cogvideo"] = [
+     "CogVideoXPipeline",
+     "CogVideoXImageToVideoPipeline",
+     "CogVideoXVideoToVideoPipeline",
+ ]
  _import_structure["controlnet"].extend(
      [
          "BlipDiffusionControlNetPipeline",
@@ -452,7 +456,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
  )
  from .aura_flow import AuraFlowPipeline
  from .blip_diffusion import BlipDiffusionPipeline
- from .cogvideo import CogVideoXPipeline
+ from .cogvideo import CogVideoXImageToVideoPipeline, CogVideoXPipeline, CogVideoXVideoToVideoPipeline
  from .controlnet import (
      BlipDiffusionControlNetPipeline,
      StableDiffusionControlNetImg2ImgPipeline,
@@ -23,6 +23,8 @@ except OptionalDependencyNotAvailable:
  _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
  else:
      _import_structure["pipeline_cogvideox"] = ["CogVideoXPipeline"]
+     _import_structure["pipeline_cogvideox_image2video"] = ["CogVideoXImageToVideoPipeline"]
+     _import_structure["pipeline_cogvideox_video2video"] = ["CogVideoXVideoToVideoPipeline"]
 
  if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
      try:
@@ -33,6 +35,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
      from ...utils.dummy_torch_and_transformers_objects import *
  else:
      from .pipeline_cogvideox import CogVideoXPipeline
+     from .pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
+     from .pipeline_cogvideox_video2video import CogVideoXVideoToVideoPipeline
 
  else:
      import sys
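The pipeline-registry hunks above wire the two new CogVideoX pipelines into the lazy-import machinery of `diffusers.pipelines` and the `cogvideo` subpackage. A hedged end-to-end sketch of the image-to-video entry point; the checkpoint name, dtype and call arguments follow the public CogVideoX-5b-I2V release and are not spelled out in this diff:

```python
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Assumed checkpoint and arguments, for illustration.
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16
).to("cuda")

image = load_image("first_frame.png")
video = pipe(
    image=image,
    prompt="a panda strumming a guitar in a bamboo forest",
    num_inference_steps=50,
    num_frames=49,
).frames[0]
export_to_video(video, "output.mp4", fps=8)
```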
@@ -15,7 +15,6 @@
 
  import inspect
  import math
- from dataclasses import dataclass
  from typing import Callable, Dict, List, Optional, Tuple, Union
 
  import torch
@@ -26,9 +25,10 @@ from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
  from ...models.embeddings import get_3d_rotary_pos_embed
  from ...pipelines.pipeline_utils import DiffusionPipeline
  from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
- from ...utils import BaseOutput, logging, replace_example_docstring
+ from ...utils import logging, replace_example_docstring
  from ...utils.torch_utils import randn_tensor
  from ...video_processor import VideoProcessor
+ from .pipeline_output import CogVideoXPipelineOutput
 
 
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -136,21 +136,6 @@ def retrieve_timesteps(
  return timesteps, num_inference_steps
 
 
- @dataclass
- class CogVideoXPipelineOutput(BaseOutput):
-     r"""
-     Output class for CogVideo pipelines.
-
-     Args:
-         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-             denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-             `(batch_size, num_frames, channels, height, width)`.
-     """
-
-     frames: torch.Tensor
-
-
  class CogVideoXPipeline(DiffusionPipeline):
      r"""
      Pipeline for text-to-video generation using CogVideoX.
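Finally, the `CogVideoXPipelineOutput` dataclass is lifted out of `pipeline_cogvideox.py` into a shared `pipeline_output` module so all three CogVideoX pipelines return the same output type; the text-to-video module now simply re-imports it. For downstream code, the canonical import is sketched below (module path inferred from the relative import above; the old path most likely keeps working through that re-import):

```python
import torch

# Inferred new location of the output dataclass.
from diffusers.pipelines.cogvideo.pipeline_output import CogVideoXPipelineOutput

out = CogVideoXPipelineOutput(frames=torch.zeros(1, 2, 3, 8, 8))
print(type(out.frames))
```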