diffusers 0.27.2__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +19 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +20 -26
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +42 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
- diffusers/schedulers/scheduling_edm_euler.py +50 -31
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
- diffusers/schedulers/scheduling_euler_discrete.py +160 -68
- diffusers/schedulers/scheduling_heun_discrete.py +57 -39
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +24 -26
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/METADATA +47 -47
- diffusers-0.28.0.dist-info/RECORD +414 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/WHEEL +1 -1
- diffusers-0.27.2.dist-info/RECORD +0 -399
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py

@@ -21,11 +21,12 @@ import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

-from ...image_processor import PipelineImageInput
+from ...image_processor import PipelineImageInput
 from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
 from ...schedulers import EulerDiscreteScheduler
 from ...utils import BaseOutput, logging, replace_example_docstring
 from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline


@@ -37,10 +38,14 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import StableVideoDiffusionPipeline
         >>> from diffusers.utils import load_image, export_to_video

-        >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
+        >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
+        ...     "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+        ... )
         >>> pipe.to("cuda")

-        >>> image = load_image(
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
+        ... )
         >>> image = image.resize((1024, 576))

         >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
@@ -57,26 +62,64 @@ def _append_dims(x, target_dims):
     return x[(...,) + (None,) * dims_to_append]


-# Copied from diffusers.pipelines.
-def
-    … (body of the removed helper is truncated in the source diff)
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

-
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps


 @dataclass
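The `retrieve_timesteps` helper copied in above is what lets the pipeline accept a plain step count, an explicit timestep list, or an explicit sigma schedule. A minimal sketch of calling it directly, assuming the scheduler's `set_timesteps` accepts a `sigmas` argument (which the 0.28.0 scheduler changes add for `EulerDiscreteScheduler`); the sigma values below are placeholders, not taken from this diff:

from diffusers import EulerDiscreteScheduler

# retrieve_timesteps as defined above (module-level in the pipeline file).
scheduler = EulerDiscreteScheduler()  # default config, for illustration only

# Plain step count: the scheduler builds its own schedule.
timesteps, num_steps = retrieve_timesteps(scheduler, num_inference_steps=25, device="cpu")

# Custom sigmas: mutually exclusive with `timesteps`; the step count is then
# derived from the length of the schedule.
placeholder_sigmas = [10.0, 5.0, 2.0, 1.0, 0.5, 0.1]
timesteps, num_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=placeholder_sigmas)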
@@ -85,12 +128,12 @@ class StableVideoDiffusionPipelineOutput(BaseOutput):
     Output class for Stable Video Diffusion pipeline.

     Args:
-        frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.
-            List of denoised PIL images of length `batch_size` or numpy array or torch tensor
-
+        frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]):
+            List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
+            num_frames, height, width, num_channels)`.
     """

-    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.
+    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]


 class StableVideoDiffusionPipeline(DiffusionPipeline):
@@ -104,7 +147,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         vae ([`AutoencoderKLTemporalDecoder`]):
             Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
         image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
-            Frozen CLIP image-encoder
+            Frozen CLIP image-encoder
+            ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
         unet ([`UNetSpatioTemporalConditionModel`]):
             A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
         scheduler ([`EulerDiscreteScheduler`]):
@@ -134,7 +178,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
             feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.
+        self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)

     def _encode_image(
         self,
@@ -142,12 +186,12 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         device: Union[str, torch.device],
         num_videos_per_prompt: int,
         do_classifier_free_guidance: bool,
-    ) -> torch.
+    ) -> torch.Tensor:
         dtype = next(self.image_encoder.parameters()).dtype

         if not isinstance(image, torch.Tensor):
-            image = self.
-            image = self.
+            image = self.video_processor.pil_to_numpy(image)
+            image = self.video_processor.numpy_to_pt(image)

             # We normalize the image before resizing to match with the original implementation.
             # Then we unnormalize it after resizing.
@@ -194,6 +238,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         image = image.to(device=device)
         image_latents = self.vae.encode(image).latent_dist.mode()

+        # duplicate image_latents for each generation per prompt, using mps friendly method
+        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
+
         if do_classifier_free_guidance:
             negative_image_latents = torch.zeros_like(image_latents)

@@ -202,9 +249,6 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
             # to avoid doing two forward passes
             image_latents = torch.cat([negative_image_latents, image_latents])

-        # duplicate image_latents for each generation per prompt, using mps friendly method
-        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
-
         return image_latents

     def _get_add_time_ids(
@@ -235,7 +279,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

         return add_time_ids

-    def decode_latents(self, latents: torch.
+    def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
         # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
         latents = latents.flatten(0, 1)

@@ -271,7 +315,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
             and not isinstance(image, list)
         ):
             raise ValueError(
-                "`image` has to be of type `torch.
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                 f" {type(image)}"
             )

@@ -288,7 +332,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         dtype: torch.dtype,
         device: Union[str, torch.device],
         generator: torch.Generator,
-        latents: Optional[torch.
+        latents: Optional[torch.Tensor] = None,
     ):
         shape = (
             batch_size,
@@ -333,11 +377,12 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
-        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.
+        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
         height: int = 576,
         width: int = 1024,
         num_frames: Optional[int] = None,
         num_inference_steps: int = 25,
+        sigmas: Optional[List[float]] = None,
         min_guidance_scale: float = 1.0,
         max_guidance_scale: float = 3.0,
         fps: int = 7,
@@ -346,7 +391,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         decode_chunk_size: Optional[int] = None,
         num_videos_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
@@ -356,39 +401,46 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         The call function to the pipeline for generation.

         Args:
-            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.
-                Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
+                Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
+                1]`.
             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The height in pixels of the generated image.
             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The width in pixels of the generated image.
             num_frames (`int`, *optional*):
-                The number of video frames to generate. Defaults to `self.unet.config.num_frames`
-
+                The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
+                `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
             num_inference_steps (`int`, *optional*, defaults to 25):
                 The number of denoising steps. More denoising steps usually lead to a higher quality video at the
                 expense of slower inference. This parameter is modulated by `strength`.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             min_guidance_scale (`float`, *optional*, defaults to 1.0):
                 The minimum guidance scale. Used for the classifier free guidance with first frame.
             max_guidance_scale (`float`, *optional*, defaults to 3.0):
                 The maximum guidance scale. Used for the classifier free guidance with last frame.
             fps (`int`, *optional*, defaults to 7):
-                Frames per second. The rate at which the generated images shall be exported to a video after
-                Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
+                Frames per second. The rate at which the generated images shall be exported to a video after
+                generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
             motion_bucket_id (`int`, *optional*, defaults to 127):
                 Used for conditioning the amount of motion for the generation. The higher the number the more motion
                 will be in the video.
             noise_aug_strength (`float`, *optional*, defaults to 0.02):
-                The amount of noise added to the init image, the higher it is the less the video will look like the
+                The amount of noise added to the init image, the higher it is the less the video will look like the
+                init image. Increase it for more motion.
             decode_chunk_size (`int`, *optional*):
-                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
-
+                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
+                expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
+                For lower memory usage, reduce `decode_chunk_size`.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
@@ -398,7 +450,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
                 A function that is called at the end of each denoising step during inference. The function is called
                 with the following arguments:
                 `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
-                `callback_kwargs` will include a list of all tensors as specified by
+                `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -411,8 +464,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

         Returns:
             [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
-                otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
+                returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is
+                returned.
         """
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
@@ -445,7 +499,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         fps = fps - 1

         # 4. Encode input image using VAE
-        image = self.
+        image = self.video_processor.preprocess(image, height=height, width=width).to(device)
         noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
         image = image + noise_aug_strength * noise

@@ -482,8 +536,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         added_time_ids = added_time_ids.to(device)

         # 6. Prepare timesteps
-        self.scheduler
-        timesteps = self.scheduler.timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)

         # 7. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
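Because step 6 now goes through `retrieve_timesteps`, `StableVideoDiffusionPipeline.__call__` can be driven by a custom sigma schedule instead of `num_inference_steps`. A hedged end-to-end sketch; the checkpoint and image URL are the ones from the docstring example, while the sigma values are purely illustrative:

import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, load_image

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
)
image = image.resize((1024, 576))

# Illustrative, strictly decreasing sigma schedule; it replaces the schedule that
# num_inference_steps=25 would otherwise produce.
sigmas = [700.0, 200.0, 60.0, 20.0, 6.0, 2.0, 0.7, 0.2, 0.05]
frames = pipe(image, sigmas=sigmas, decode_chunk_size=8).frames[0]
export_to_video(frames, "generated.mp4", fps=7)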
@@ -552,7 +605,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
             if needs_upcasting:
                 self.vae.to(dtype=torch.float16)
             frames = self.decode_latents(latents, num_frames, decode_chunk_size)
-            frames =
+            frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)
         else:
             frames = latents

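The image plumbing that used to go through a `VaeImageProcessor` now goes through the new `VideoProcessor` (`diffusers/video_processor.py` in the file list above). A rough sketch of the round trip the pipeline performs, under the assumption that `VideoProcessor` keeps the `VaeImageProcessor`-style `preprocess` API for single images and that `postprocess_video` takes the 5-D frame tensor produced by `decode_latents`:

import PIL.Image
import torch
from diffusers.video_processor import VideoProcessor

# vae_scale_factor as the pipeline computes it from the VAE config (8 is illustrative).
video_processor = VideoProcessor(do_resize=True, vae_scale_factor=8)

# Conditioning image -> resized, normalized tensor (as in step 4 of __call__).
image = PIL.Image.new("RGB", (800, 450))
pixels = video_processor.preprocess(image, height=576, width=1024)

# Decoded frames -> final output; layout assumed to be (batch, channels, frames, height, width).
decoded = torch.rand(1, 3, 14, 576, 1024) * 2 - 1
video = video_processor.postprocess_video(video=decoded, output_type="pil")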
@@ -627,7 +680,7 @@ def _filter2d(input, kernel):

     height, width = tmp_kernel.shape[-2:]

-    padding_shape:
+    padding_shape: List[int] = _compute_padding([height, width])
     input = torch.nn.functional.pad(input, padding_shape, mode="reflect")

     # kernel and input tensor reshape to align element-wise or batch-wise params
diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py

@@ -124,6 +124,7 @@ def retrieve_timesteps(
     num_inference_steps: Optional[int] = None,
     device: Optional[Union[str, torch.device]] = None,
     timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
     """
@@ -134,19 +135,23 @@ def retrieve_timesteps(
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
         num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
         device (`str` or `torch.device`, *optional*):
             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
         timesteps (`List[int]`, *optional*):
-
-
-
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.

     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
     """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
     if timesteps is not None:
         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
         if not accepts_timesteps:
@@ -157,6 +162,16 @@ def retrieve_timesteps(
         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
         timesteps = scheduler.timesteps
         num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
     else:
         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
         timesteps = scheduler.timesteps
@@ -256,8 +271,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.
-        negative_prompt_embeds: Optional[torch.
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -289,8 +304,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.
-        negative_prompt_embeds: Optional[torch.
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -310,10 +325,10 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -569,7 +584,12 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -613,20 +633,22 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         return height, width

     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

         Args:
-
-
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
             embedding_dim (`int`, *optional*, defaults to 512):
-
-            dtype:
-
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

         Returns:
-            `torch.
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
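With the signature spelled out above, the contract of `get_guidance_scale_embedding` is explicit: a 1-D tensor of guidance scales in, a `(len(w), embedding_dim)` embedding out. A small sketch, assuming `pipe` is an already-loaded `StableDiffusionAdapterPipeline`:

import torch

# One guidance scale per batch item; the method asserts that `w` is 1-D.
w = torch.tensor([7.5, 5.0])
emb = pipe.get_guidance_scale_embedding(w, embedding_dim=256, dtype=torch.float16)
assert emb.shape == (2, 256)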
@@ -662,17 +684,18 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
         width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
+        sigmas: List[float] = None,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.
-        prompt_embeds: Optional[torch.
-        negative_prompt_embeds: Optional[torch.
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         adapter_conditioning_scale: Union[float, List[float]] = 1.0,
@@ -685,9 +708,9 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
-            image (`torch.
+            image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
                 The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
-                type is specified as `
+                type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
                 accepted as an image. The control image is automatically resized to fit the output image.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
@@ -700,6 +723,10 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -718,14 +745,14 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -737,7 +764,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
                 of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -809,7 +836,9 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

         # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )

         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
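Taken together with the new `sigmas` argument on `__call__`, the adapter pipeline can now be sampled on a user-supplied sigma schedule, provided its scheduler's `set_timesteps` accepts one. A hedged sketch; the checkpoints are illustrative and the scheduler is swapped to `EulerDiscreteScheduler` (assumed here to accept `sigmas` in `set_timesteps`), since the stock SD 1.5 scheduler does not:

import PIL.Image
import torch
from diffusers import EulerDiscreteScheduler, StableDiffusionAdapterPipeline, T2IAdapter

# Illustrative checkpoints, not taken from this diff.
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
pipe = StableDiffusionAdapterPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", adapter=adapter, torch_dtype=torch.float16
)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

# Placeholder control image; in practice this would be a precomputed canny edge map.
control_image = PIL.Image.new("L", (512, 512))

# Illustrative descending schedule; mutually exclusive with `timesteps`.
sigmas = [14.6, 6.3, 3.0, 1.6, 0.9, 0.5, 0.25, 0.1]
image = pipe("a photo of a house", image=control_image, sigmas=sigmas).images[0]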