diffusers 0.30.0__py3-none-any.whl → 0.30.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,6 +68,21 @@ class AuraFlowPatchEmbed(nn.Module):
         self.height, self.width = height // patch_size, width // patch_size
         self.base_size = height // patch_size
 
+    def pe_selection_index_based_on_dim(self, h, w):
+        # select subset of positional embedding based on H, W, where H, W is size of latent
+        # PE will be viewed as 2d-grid, and H/p x W/p of the PE will be selected
+        # because original input are in flattened format, we have to flatten this 2d grid as well.
+        h_p, w_p = h // self.patch_size, w // self.patch_size
+        original_pe_indexes = torch.arange(self.pos_embed.shape[1])
+        h_max, w_max = int(self.pos_embed_max_size**0.5), int(self.pos_embed_max_size**0.5)
+        original_pe_indexes = original_pe_indexes.view(h_max, w_max)
+        starth = h_max // 2 - h_p // 2
+        endh = starth + h_p
+        startw = w_max // 2 - w_p // 2
+        endw = startw + w_p
+        original_pe_indexes = original_pe_indexes[starth:endh, startw:endw]
+        return original_pe_indexes.flatten()
+
     def forward(self, latent):
         batch_size, num_channels, height, width = latent.size()
         latent = latent.view(
@@ -80,7 +95,8 @@ class AuraFlowPatchEmbed(nn.Module):
         )
         latent = latent.permute(0, 2, 4, 1, 3, 5).flatten(-3).flatten(1, 2)
         latent = self.proj(latent)
-        return latent + self.pos_embed
+        pe_index = self.pe_selection_index_based_on_dim(height, width)
+        return latent + self.pos_embed[:, pe_index]
 
 
 # Taken from the original Aura flow inference code.
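For intuition, here is a minimal, standalone sketch (not part of the diff) of the center-crop index selection that `pe_selection_index_based_on_dim` performs, using hypothetical sizes: a 4x4 positional-embedding grid and a 6x6 latent with patch size 2, so the central 3x3 block of indices is kept.

    import torch

    pos_embed_max_size, patch_size = 16, 2  # 4x4 grid of positional embeddings
    h, w = 6, 6                             # latent height / width

    h_p, w_p = h // patch_size, w // patch_size       # 3 x 3 patches
    h_max = w_max = int(pos_embed_max_size**0.5)      # 4
    indexes = torch.arange(pos_embed_max_size).view(h_max, w_max)

    starth, startw = h_max // 2 - h_p // 2, w_max // 2 - w_p // 2
    selected = indexes[starth : starth + h_p, startw : startw + w_p].flatten()
    print(selected)  # tensor([ 5,  6,  7,  9, 10, 11, 13, 14, 15])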
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -22,6 +22,7 @@ from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import is_torch_version, logging
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..attention import Attention, FeedForward
+from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
 from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -37,13 +38,20 @@ class CogVideoXBlock(nn.Module):
     Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.
 
     Parameters:
-        dim (`int`): The number of channels in the input and output.
-        num_attention_heads (`int`): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`): The number of channels in each head.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        attention_bias (:
-            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        dim (`int`):
+            The number of channels in the input and output.
+        num_attention_heads (`int`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`):
+            The number of channels in each head.
+        time_embed_dim (`int`):
+            The number of channels in timestep embedding.
+        dropout (`float`, defaults to `0.0`):
+            The dropout probability to use.
+        activation_fn (`str`, defaults to `"gelu-approximate"`):
+            Activation function to be used in feed-forward.
+        attention_bias (`bool`, defaults to `False`):
+            Whether or not to use bias in attention projection layers.
         qk_norm (`bool`, defaults to `True`):
             Whether or not to use normalization after query and key projections in Attention.
         norm_elementwise_affine (`bool`, defaults to `True`):
@@ -90,6 +98,7 @@ class CogVideoXBlock(nn.Module):
             eps=1e-6,
             bias=attention_bias,
             out_bias=attention_out_bias,
+            processor=CogVideoXAttnProcessor2_0(),
         )
 
         # 2. Feed Forward
@@ -109,24 +118,24 @@ class CogVideoXBlock(nn.Module):
         hidden_states: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         temb: torch.Tensor,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
     ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        # norm & modulate
         norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
             hidden_states, encoder_hidden_states, temb
         )
 
         # attention
-        text_length = norm_encoder_hidden_states.size(1)
-
-        # CogVideoX uses concatenated text + video embeddings with self-attention instead of using
-        # them in cross-attention individually
-        norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
-        attn_output = self.attn1(
+        attn_hidden_states, attn_encoder_hidden_states = self.attn1(
             hidden_states=norm_hidden_states,
-            encoder_hidden_states=None,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            image_rotary_emb=image_rotary_emb,
         )
 
-        hidden_states = hidden_states + gate_msa * attn_output[:, text_length:]
-        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_output[:, :text_length]
+        hidden_states = hidden_states + gate_msa * attn_hidden_states
+        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
 
         # norm & modulate
         norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
@@ -137,8 +146,9 @@ class CogVideoXBlock(nn.Module):
         norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
         ff_output = self.ff(norm_hidden_states)
 
-        hidden_states = hidden_states + gate_ff * ff_output[:, text_length:]
-        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_length]
+        hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
+        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
+
         return hidden_states, encoder_hidden_states
 
 
@@ -147,36 +157,53 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
     A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
 
     Parameters:
-        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
-        in_channels (`int`, *optional*):
+        num_attention_heads (`int`, defaults to `30`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, defaults to `64`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `16`):
             The number of channels in the input.
-        out_channels (`int`, *optional*):
+        out_channels (`int`, *optional*, defaults to `16`):
             The number of channels in the output.
-        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
-        attention_bias (`bool`, *optional*):
-            Configure if the `TransformerBlocks` attention should contain a bias parameter.
-        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
-            This is fixed during training since it is used to learn a number of position embeddings.
-        patch_size (`int`, *optional*):
+        flip_sin_to_cos (`bool`, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        time_embed_dim (`int`, defaults to `512`):
+            Output dimension of timestep embeddings.
+        text_embed_dim (`int`, defaults to `4096`):
+            Input dimension of text embeddings from the text encoder.
+        num_layers (`int`, defaults to `30`):
+            The number of layers of Transformer blocks to use.
+        dropout (`float`, defaults to `0.0`):
+            The dropout probability to use.
+        attention_bias (`bool`, defaults to `True`):
+            Whether or not to use bias in the attention projection layers.
+        sample_width (`int`, defaults to `90`):
+            The width of the input latents.
+        sample_height (`int`, defaults to `60`):
+            The height of the input latents.
+        sample_frames (`int`, defaults to `49`):
+            The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
+            instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
+            but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
+            K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
+        patch_size (`int`, defaults to `2`):
             The size of the patches to use in the patch embedding layer.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
-        num_embeds_ada_norm ( `int`, *optional*):
-            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
-            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
-            added to the hidden states. During inference, you can denoise for up to but not more steps than
-            `num_embeds_ada_norm`.
-        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
-            The type of normalization to use. Options are `"layer_norm"` or `"ada_layer_norm"`.
-        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+        temporal_compression_ratio (`int`, defaults to `4`):
+            The compression ratio across the temporal dimension. See documentation for `sample_frames`.
+        max_text_seq_length (`int`, defaults to `226`):
+            The maximum sequence length of the input text embeddings.
+        activation_fn (`str`, defaults to `"gelu-approximate"`):
+            Activation function to use in feed-forward.
+        timestep_activation_fn (`str`, defaults to `"silu"`):
+            Activation function to use when generating the timestep embeddings.
+        norm_elementwise_affine (`bool`, defaults to `True`):
             Whether or not to use elementwise affine in normalization layers.
-        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use in normalization layers.
-        caption_channels (`int`, *optional*):
-            The number of channels in the caption embeddings.
-        video_length (`int`, *optional*):
-            The number of frames in the video-like data.
+        norm_eps (`float`, defaults to `1e-5`):
+            The epsilon value to use in normalization layers.
+        spatial_interpolation_scale (`float`, defaults to `1.875`):
+            Scaling factor to apply in 3D positional embeddings across spatial dimensions.
+        temporal_interpolation_scale (`float`, defaults to `1.0`):
+            Scaling factor to apply in 3D positional embeddings across temporal dimensions.
     """
 
     _supports_gradient_checkpointing = True
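To make the `sample_frames` note above concrete (a worked example, not part of the diff): with the default `temporal_compression_ratio` of 4, the 13 latent frames that CogVideoX denoises at once correspond to (13 - 1) * 4 + 1 = 49 pixel-space frames, which is where both the `sample_frames=49` default here and the pipeline's `num_frames=49` default further down come from.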
@@ -186,7 +213,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
         self,
         num_attention_heads: int = 30,
         attention_head_dim: int = 64,
-        in_channels: Optional[int] = 16,
+        in_channels: int = 16,
         out_channels: Optional[int] = 16,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
@@ -207,6 +234,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
         norm_eps: float = 1e-5,
         spatial_interpolation_scale: float = 1.875,
         temporal_interpolation_scale: float = 1.0,
+        use_rotary_positional_embeddings: bool = False,
     ):
         super().__init__()
         inner_dim = num_attention_heads * attention_head_dim
@@ -271,12 +299,113 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
     def _set_gradient_checkpointing(self, module, value=False):
         self.gradient_checkpointing = value
 
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
     def forward(
         self,
         hidden_states: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         timestep: Union[int, float, torch.LongTensor],
         timestep_cond: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         return_dict: bool = True,
     ):
         batch_size, num_frames, channels, height, width = hidden_states.shape
@@ -295,16 +424,18 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
         hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
 
         # 3. Position embedding
-        seq_length = height * width * num_frames // (self.config.patch_size**2)
+        text_seq_length = encoder_hidden_states.shape[1]
+        if not self.config.use_rotary_positional_embeddings:
+            seq_length = height * width * num_frames // (self.config.patch_size**2)
 
-        pos_embeds = self.pos_embedding[:, : self.config.max_text_seq_length + seq_length]
-        hidden_states = hidden_states + pos_embeds
-        hidden_states = self.embedding_dropout(hidden_states)
+            pos_embeds = self.pos_embedding[:, : text_seq_length + seq_length]
+            hidden_states = hidden_states + pos_embeds
+            hidden_states = self.embedding_dropout(hidden_states)
 
-        encoder_hidden_states = hidden_states[:, : self.config.max_text_seq_length]
-        hidden_states = hidden_states[:, self.config.max_text_seq_length :]
+        encoder_hidden_states = hidden_states[:, :text_seq_length]
+        hidden_states = hidden_states[:, text_seq_length:]
 
-        # 5. Transformer blocks
+        # 4. Transformer blocks
         for i, block in enumerate(self.transformer_blocks):
             if self.training and self.gradient_checkpointing:
 
@@ -320,6 +451,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
                     hidden_states,
                     encoder_hidden_states,
                     emb,
+                    image_rotary_emb,
                     **ckpt_kwargs,
                 )
             else:
@@ -327,15 +459,23 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
                     hidden_states=hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
                     temb=emb,
+                    image_rotary_emb=image_rotary_emb,
                 )
 
-        hidden_states = self.norm_final(hidden_states)
+        if not self.config.use_rotary_positional_embeddings:
+            # CogVideoX-2B
+            hidden_states = self.norm_final(hidden_states)
+        else:
+            # CogVideoX-5B
+            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+            hidden_states = self.norm_final(hidden_states)
+            hidden_states = hidden_states[:, text_seq_length:]
 
-        # 6. Final block
+        # 5. Final block
         hidden_states = self.norm_out(hidden_states, temb=emb)
         hidden_states = self.proj_out(hidden_states)
 
-        # 7. Unpatchify
+        # 6. Unpatchify
        p = self.config.patch_size
        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, channels, p, p)
        output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
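As a sanity check on the unpatchify step above, a small shape-only sketch with hypothetical dimensions (not taken from the diff):

    import torch

    # Hypothetical sizes: batch 1, 2 latent frames, 4 output channels, 8x8 latent, patch size 2.
    batch_size, num_frames, channels, height, width, p = 1, 2, 4, 8, 8, 2

    # After proj_out there is one token per patch, each holding channels * p * p values.
    hidden_states = torch.randn(batch_size, num_frames * (height // p) * (width // p), channels * p * p)

    output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, channels, p, p)
    output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
    print(output.shape)  # torch.Size([1, 2, 4, 8, 8]) -> [batch, frames, channels, height, width]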
@@ -23,6 +23,7 @@ from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
+from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
 from ...utils import BaseOutput, logging, replace_example_docstring
@@ -40,6 +41,7 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import CogVideoXPipeline
         >>> from diffusers.utils import export_to_video
 
+        >>> # Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
         >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")
         >>> prompt = (
         ...     "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
@@ -55,6 +57,25 @@ EXAMPLE_DOC_STRING = """
 """
 
 
+# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
+def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
+    tw = tgt_width
+    th = tgt_height
+    h, w = src
+    r = h / w
+    if r > (th / tw):
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
@@ -332,20 +353,11 @@ class CogVideoXPipeline(DiffusionPipeline):
         latents = latents * self.scheduler.init_noise_sigma
         return latents
 
-    def decode_latents(self, latents: torch.Tensor, num_seconds: int):
+    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
         latents = 1 / self.vae.config.scaling_factor * latents
 
-        frames = []
-        for i in range(num_seconds):
-            start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
-
-            current_frames = self.vae.decode(latents[:, :, start_frame:end_frame]).sample
-            frames.append(current_frames)
-
-        self.vae.clear_fake_context_parallel_cache()
-
-        frames = torch.cat(frames, dim=2)
+        frames = self.vae.decode(latents).sample
         return frames
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -418,6 +430,46 @@ class CogVideoXPipeline(DiffusionPipeline):
                     f" {negative_prompt_embeds.shape}."
                 )
 
+    def fuse_qkv_projections(self) -> None:
+        r"""Enables fused QKV projections."""
+        self.fusing_transformer = True
+        self.transformer.fuse_qkv_projections()
+
+    def unfuse_qkv_projections(self) -> None:
+        r"""Disable QKV projection fusion if enabled."""
+        if not self.fusing_transformer:
+            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
+        else:
+            self.transformer.unfuse_qkv_projections()
+            self.fusing_transformer = False
+
+    def _prepare_rotary_positional_embeddings(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        device: torch.device,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+
+        grid_crops_coords = get_resize_crop_region_for_grid(
+            (grid_height, grid_width), base_size_width, base_size_height
+        )
+        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+            embed_dim=self.transformer.config.attention_head_dim,
+            crops_coords=grid_crops_coords,
+            grid_size=(grid_height, grid_width),
+            temporal_size=num_frames,
+            use_real=True,
+        )
+
+        freqs_cos = freqs_cos.to(device=device)
+        freqs_sin = freqs_sin.to(device=device)
+        return freqs_cos, freqs_sin
+
     @property
     def guidance_scale(self):
         return self._guidance_scale
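For reference, a hedged usage sketch of the new pipeline-level fuse/unfuse hooks added above (model id and call pattern follow the example docstring earlier in this diff; the prompt and settings are illustrative only):

    import torch
    from diffusers import CogVideoXPipeline

    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")

    # Fuse query/key/value projections in the transformer before sampling, then restore them.
    pipe.fuse_qkv_projections()
    video = pipe("A panda playing a guitar in a bamboo forest", num_frames=49).frames[0]
    pipe.unfuse_qkv_projections()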
@@ -438,8 +490,7 @@ class CogVideoXPipeline(DiffusionPipeline):
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: int = 480,
         width: int = 720,
-        num_frames: int = 48,
-        fps: int = 8,
+        num_frames: int = 49,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         guidance_scale: float = 6,
@@ -534,9 +585,10 @@ class CogVideoXPipeline(DiffusionPipeline):
                 `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
-        assert (
-            num_frames <= 48 and num_frames % fps == 0 and fps == 8
-        ), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
+        if num_frames > 49:
+            raise ValueError(
+                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
+            )
 
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
@@ -593,7 +645,6 @@ class CogVideoXPipeline(DiffusionPipeline):
 
         # 5. Prepare latents.
         latent_channels = self.transformer.config.in_channels
-        num_frames += 1
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,
             latent_channels,
@@ -609,7 +660,14 @@ class CogVideoXPipeline(DiffusionPipeline):
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        # 7. Denoising loop
+        # 7. Create rotary embeds if required
+        image_rotary_emb = (
+            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            if self.transformer.config.use_rotary_positional_embeddings
+            else None
+        )
+
+        # 8. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
 
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -630,6 +688,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                     hidden_states=latent_model_input,
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
+                    image_rotary_emb=image_rotary_emb,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
@@ -673,7 +732,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                     progress_bar.update()
 
         if not output_type == "latent":
-            video = self.decode_latents(latents, num_frames // fps)
+            video = self.decode_latents(latents)
             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
         else:
             video = latents
@@ -9,7 +9,7 @@ import numpy as np
 import PIL.Image
 import PIL.ImageOps
 
-from .import_utils import BACKENDS_MAPPING, is_opencv_available
+from .import_utils import BACKENDS_MAPPING, is_imageio_available, is_opencv_available
 from .logging import get_logger
 
 
@@ -112,9 +112,9 @@ def export_to_obj(mesh, output_obj_path: str = None):
         f.writelines("\n".join(combined_data))
 
 
-def export_to_video(
+def _legacy_export_to_video(
     video_frames: Union[List[np.ndarray], List[PIL.Image.Image]], output_video_path: str = None, fps: int = 10
-) -> str:
+):
     if is_opencv_available():
         import cv2
     else:
@@ -134,4 +134,51 @@ def export_to_video(
     for i in range(len(video_frames)):
         img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
         video_writer.write(img)
+
+    return output_video_path
+
+
+def export_to_video(
+    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]], output_video_path: str = None, fps: int = 10
+) -> str:
+    # TODO: Dhruv. Remove by Diffusers release 0.33.0
+    # Added to prevent breaking existing code
+    if not is_imageio_available():
+        logger.warning(
+            (
+                "It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. \n"
+                "These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. \n"
+                "Support for the OpenCV backend will be deprecated in a future Diffusers version"
+            )
+        )
+        return _legacy_export_to_video(video_frames, output_video_path, fps)
+
+    if is_imageio_available():
+        import imageio
+    else:
+        raise ImportError(BACKENDS_MAPPING["imageio"][1].format("export_to_video"))
+
+    try:
+        imageio.plugins.ffmpeg.get_exe()
+    except AttributeError:
+        raise AttributeError(
+            (
+                "Found an existing imageio backend in your environment. Attempting to export video with imageio. \n"
+                "Unable to find a compatible ffmpeg installation in your environment to use with imageio. Please install via `pip install imageio-ffmpeg"
+            )
+        )
+
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+
+    if isinstance(video_frames[0], np.ndarray):
+        video_frames = [(frame * 255).astype(np.uint8) for frame in video_frames]
+
+    elif isinstance(video_frames[0], PIL.Image.Image):
+        video_frames = [np.array(frame) for frame in video_frames]
+
+    with imageio.get_writer(output_video_path, fps=fps) as writer:
+        for frame in video_frames:
+            writer.append_data(frame)
+
     return output_video_path
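A short usage sketch of the updated exporter taking the imageio / imageio-ffmpeg path (the frames here are random placeholders, not pipeline output):

    import numpy as np
    from diffusers.utils import export_to_video

    # Dummy RGB frames with values in [0, 1]; export_to_video scales ndarray frames by 255.
    frames = [np.random.rand(64, 64, 3).astype(np.float32) for _ in range(16)]
    path = export_to_video(frames, "sample.mp4", fps=8)
    print(path)  # "sample.mp4"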