diffusers 0.30.3__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- diffusers/__init__.py +34 -2
- diffusers/configuration_utils.py +12 -0
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +257 -54
- diffusers/loaders/__init__.py +2 -0
- diffusers/loaders/ip_adapter.py +5 -1
- diffusers/loaders/lora_base.py +14 -7
- diffusers/loaders/lora_conversion_utils.py +332 -0
- diffusers/loaders/lora_pipeline.py +707 -41
- diffusers/loaders/peft.py +1 -0
- diffusers/loaders/single_file_utils.py +81 -4
- diffusers/loaders/textual_inversion.py +2 -0
- diffusers/loaders/unet.py +39 -8
- diffusers/models/__init__.py +4 -0
- diffusers/models/adapter.py +53 -53
- diffusers/models/attention.py +86 -10
- diffusers/models/attention_processor.py +169 -133
- diffusers/models/autoencoders/autoencoder_kl.py +71 -11
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +187 -88
- diffusers/models/controlnet_flux.py +536 -0
- diffusers/models/controlnet_sd3.py +7 -3
- diffusers/models/controlnet_sparsectrl.py +0 -1
- diffusers/models/embeddings.py +170 -61
- diffusers/models/embeddings_flax.py +23 -9
- diffusers/models/model_loading_utils.py +182 -14
- diffusers/models/modeling_utils.py +283 -46
- diffusers/models/normalization.py +79 -0
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +1 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +23 -2
- diffusers/models/transformers/pixart_transformer_2d.py +9 -1
- diffusers/models/transformers/transformer_cogview3plus.py +386 -0
- diffusers/models/transformers/transformer_flux.py +161 -44
- diffusers/models/transformers/transformer_sd3.py +7 -1
- diffusers/models/unets/unet_2d_condition.py +8 -8
- diffusers/models/unets/unet_motion_model.py +41 -63
- diffusers/models/upsampling.py +6 -6
- diffusers/pipelines/__init__.py +35 -6
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +44 -20
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -66
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -1
- diffusers/pipelines/auto_pipeline.py +39 -8
- diffusers/pipelines/cogvideo/__init__.py +2 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +30 -17
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +794 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +41 -31
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +42 -29
- diffusers/pipelines/cogview3/__init__.py +47 -0
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
- diffusers/pipelines/cogview3/pipeline_output.py +21 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +36 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -1
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +17 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +3 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
- diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +10 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -20
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +984 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +988 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1182 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +850 -0
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1015 -0
- diffusers/pipelines/free_noise_utils.py +365 -5
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +15 -3
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
- diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
- diffusers/pipelines/kolors/tokenizer.py +4 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
- diffusers/pipelines/latte/pipeline_latte.py +2 -2
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
- diffusers/pipelines/lumina/pipeline_lumina.py +2 -2
- diffusers/pipelines/pag/__init__.py +6 -0
- diffusers/pipelines/pag/pag_utils.py +8 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1544 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1685 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +17 -5
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +12 -3
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1091 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
- diffusers/pipelines/pia/pipeline_pia.py +2 -0
- diffusers/pipelines/pipeline_loading_utils.py +225 -27
- diffusers/pipelines/pipeline_utils.py +123 -180
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +28 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +12 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +20 -4
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +3 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -14
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -14
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
- diffusers/quantizers/__init__.py +16 -0
- diffusers/quantizers/auto.py +126 -0
- diffusers/quantizers/base.py +233 -0
- diffusers/quantizers/bitsandbytes/__init__.py +2 -0
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +558 -0
- diffusers/quantizers/bitsandbytes/utils.py +306 -0
- diffusers/quantizers/quantization_config.py +391 -0
- diffusers/schedulers/scheduling_ddim.py +4 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
- diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
- diffusers/schedulers/scheduling_ddpm.py +4 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +4 -1
- diffusers/schedulers/scheduling_deis_multistep.py +78 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +82 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +80 -1
- diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +82 -1
- diffusers/schedulers/scheduling_edm_euler.py +8 -6
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
- diffusers/schedulers/scheduling_euler_discrete.py +92 -7
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
- diffusers/schedulers/scheduling_heun_discrete.py +114 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
- diffusers/schedulers/scheduling_lms_discrete.py +76 -1
- diffusers/schedulers/scheduling_sasolver.py +78 -1
- diffusers/schedulers/scheduling_unclip.py +4 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +78 -1
- diffusers/training_utils.py +48 -18
- diffusers/utils/__init__.py +2 -1
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +165 -0
- diffusers/utils/hub_utils.py +16 -4
- diffusers/utils/import_utils.py +31 -8
- diffusers/utils/loading_utils.py +28 -4
- diffusers/utils/peft_utils.py +3 -3
- diffusers/utils/testing_utils.py +59 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/METADATA +7 -6
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/RECORD +172 -149
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/LICENSE +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/WHEEL +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.31.0.dist-info}/top_level.txt +0 -0
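
The listing above introduces a new `diffusers/quantizers` package (`auto.py`, `base.py`, `quantization_config.py`, and a `bitsandbytes/` backend) that hooks quantized model loading into `from_pretrained`. A minimal sketch of how this feature is typically used; the checkpoint id and the specific config values are illustrative assumptions, not taken from this diff:

    # Sketch only: assumes the BitsAndBytesConfig export added in 0.31.0 and an
    # SD3 checkpoint laid out with a "transformer" subfolder.
    import torch
    from diffusers import BitsAndBytesConfig, SD3Transformer2DModel

    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    transformer = SD3Transformer2DModel.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers",  # assumed model id
        subfolder="transformer",
        quantization_config=nf4_config,
    )
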
diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py

@@ -15,7 +15,7 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import PIL
 import torch
@@ -23,6 +23,7 @@ from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
@@ -87,7 +88,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -152,7 +153,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXImageToVideoPipeline(DiffusionPipeline):
+class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using CogVideoX.
 
@@ -207,6 +208,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -348,6 +352,12 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         generator: Optional[torch.Generator] = None,
         latents: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         shape = (
             batch_size,
@@ -357,12 +367,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )
 
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         image = image.unsqueeze(2)  # [B, C, F, H, W]
 
         if isinstance(generator, list):
@@ -373,7 +377,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
 
         image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-        image_latents = self.vae.config.scaling_factor * image_latents
+        image_latents = self.vae_scaling_factor_image * image_latents
 
         padding_shape = (
             batch_size,
@@ -397,7 +401,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -438,7 +442,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         width,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
-        video=None,
         latents=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
@@ -494,9 +497,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )
 
-        if video is not None and latents is not None:
-            raise ValueError("Only one of `video` or `latents` should be provided")
-
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
     def fuse_qkv_projections(self) -> None:
         r"""Enables fused QKV projections."""
@@ -547,6 +547,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -573,6 +577,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -584,7 +589,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
 
         Args:
             image (`PipelineImageInput`):
-                The input
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
@@ -592,14 +597,14 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.
-                The height in pixels of the generated image. This is set to
-            width (`int`, *optional*, defaults to self.
-                The width in pixels of the generated image. This is set to
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
-                num_seconds is 6 and fps is
+                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                 needs to be satisfied is that of divisibility mentioned above.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -636,6 +641,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -665,22 +674,22 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            image,
-            prompt,
-            height,
-            width,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-
-
+            image=image,
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -770,6 +779,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
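
The image-to-video hunks above add `CogVideoXLoraLoaderMixin` to the class bases and thread a new `attention_kwargs` argument from `__call__` down to the transformer, which is how a LoRA scale reaches the attention processors at inference time. A hedged usage sketch; the checkpoint id, LoRA path, prompt, and scale value are assumptions, not part of the diff:

    # Sketch only: `load_lora_weights` comes from the newly added
    # CogVideoXLoraLoaderMixin; `attention_kwargs` is forwarded to the
    # transformer's attention processors (e.g. {"scale": ...} for LoRA strength).
    import torch
    from diffusers import CogVideoXImageToVideoPipeline
    from diffusers.utils import load_image

    pipe = CogVideoXImageToVideoPipeline.from_pretrained(
        "THUDM/CogVideoX-5b-I2V",  # assumed checkpoint
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    pipe.load_lora_weights("path/to/cogvideox_lora", adapter_name="example")  # hypothetical LoRA path

    image = load_image("input.png")  # hypothetical conditioning image
    video = pipe(
        image=image,
        prompt="a panda strumming a guitar by a campfire",
        attention_kwargs={"scale": 0.8},  # new in 0.31.0
    ).frames[0]
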
diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py

@@ -15,21 +15,19 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from PIL import Image
 from transformers import T5EncoderModel, T5Tokenizer
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import (
-    logging,
-    replace_example_docstring,
-)
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from .pipeline_output import CogVideoXPipelineOutput
@@ -96,7 +94,7 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """
+    r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
 
@@ -161,7 +159,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
+class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for video-to-video generation using CogVideoX.
 
@@ -206,12 +204,16 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         self.register_modules(
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
+
         self.vae_scale_factor_spatial = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
@@ -353,6 +355,12 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         latents: Optional[torch.Tensor] = None,
         timestep: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)
 
         shape = (
@@ -363,12 +371,6 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )
 
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         if latents is None:
             if isinstance(generator, list):
                 if len(generator) != batch_size:
@@ -384,7 +386,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
 
             init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-            init_latents = self.vae.config.scaling_factor * init_latents
+            init_latents = self.vae_scaling_factor_image * init_latents
 
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
             latents = self.scheduler.add_noise(init_latents, noise, timestep)
@@ -398,7 +400,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents
 
         frames = self.vae.decode(latents).sample
         return frames
@@ -541,6 +543,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -567,6 +573,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -586,10 +593,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.
-                The height in pixels of the generated image. This is set to
-            width (`int`, *optional*, defaults to self.
-                The width in pixels of the generated image. This is set to
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -627,6 +634,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -651,22 +662,23 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            prompt,
-            height,
-            width,
-            strength,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-
-
+            prompt=prompt,
+            height=height,
+            width=width,
+            strength=strength,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            video=video,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -755,6 +767,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()
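
The video-to-video hunks make the same LoRA and `attention_kwargs` changes and additionally move the generator-list check to the top of `prepare_latents`, so a mismatched list of generators fails before any latent shapes are computed. A small self-contained sketch of the contract being enforced (the tensor shape is illustrative, not the pipeline's real latent shape):

    # Sketch only: one torch.Generator per batch element, or a single generator
    # for the whole batch; a length mismatch now raises up front.
    import torch

    batch_size = 2
    generators = [torch.Generator().manual_seed(seed) for seed in (0, 1)]
    assert len(generators) == batch_size  # the pipeline raises ValueError otherwise

    # Per-sample reproducible noise, analogous to how randn_tensor consumes the list.
    latents = torch.stack([torch.randn((4, 8, 8), generator=g) for g in generators])
    print(latents.shape)  # torch.Size([2, 4, 8, 8])
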
diffusers/pipelines/cogview3/__init__.py (new file)

@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["CogView3PlusPipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cogview3plus"] = ["CogView3PlusPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_cogview3plus import CogView3PlusPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
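
This new `__init__.py` registers the CogView3Plus pipeline behind diffusers' usual `_LazyModule` pattern, so `pipeline_cogview3plus` is only imported when `CogView3PlusPipeline` is first accessed (or replaced with dummy objects if torch/transformers are missing). A hedged usage sketch; the checkpoint id and generation settings are assumptions, not part of the diff:

    # Sketch only: the top-level import resolves through the lazy module shown above.
    import torch
    from diffusers import CogView3PlusPipeline

    pipe = CogView3PlusPipeline.from_pretrained(
        "THUDM/CogView3-Plus-3B",  # assumed checkpoint
        torch_dtype=torch.bfloat16,
    ).to("cuda")

    image = pipe(
        prompt="a watercolor painting of a lighthouse at dawn",
        guidance_scale=7.0,
        num_inference_steps=50,
    ).images[0]
    image.save("cogview3_sample.png")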