diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +20 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +23 -25
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +46 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
- diffusers/schedulers/scheduling_edm_euler.py +53 -30
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
- diffusers/schedulers/scheduling_euler_discrete.py +163 -67
- diffusers/schedulers/scheduling_heun_discrete.py +60 -38
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +27 -25
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
- diffusers-0.28.0.dist-info/RECORD +414 -0
- diffusers-0.27.1.dist-info/RECORD +0 -399
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
From diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py:

@@ -140,6 +140,7 @@ def retrieve_timesteps(
     num_inference_steps: Optional[int] = None,
     device: Optional[Union[str, torch.device]] = None,
     timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
     """
@@ -150,19 +151,23 @@ def retrieve_timesteps(
         scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
         num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-            `timesteps` must be `None`.
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
         device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
         timesteps (`List[int]`, *optional*):
-            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
-            timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
-            must be `None`.
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.

     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
     """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
     if timesteps is not None:
         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
         if not accepts_timesteps:
@@ -173,6 +178,16 @@ def retrieve_timesteps(
         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
         timesteps = scheduler.timesteps
         num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
     else:
         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
         timesteps = scheduler.timesteps
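Not part of the diff: a minimal usage sketch of the new `sigmas` path in this helper. The scheduler choice and sigma values below are placeholders, and the sketch assumes the scheduler's `set_timesteps` accepts a `sigmas` argument in 0.28.0; if it does not, the ValueError branch shown above is hit.

```python
# Minimal sketch (illustrative values, not taken from the diff).
from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_xl_adapter import retrieve_timesteps

scheduler = EulerDiscreteScheduler()  # assumes its 0.28.0 set_timesteps() accepts `sigmas`

# A descending sigma schedule ending at 0.0; it is forwarded to scheduler.set_timesteps(sigmas=...).
custom_sigmas = [14.6, 9.0, 5.5, 3.0, 1.5, 0.7, 0.0]
timesteps, num_inference_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=custom_sigmas)
print(num_inference_steps, timesteps)  # steps are derived from the sigma schedule
```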
@@ -281,10 +296,10 @@ class StableDiffusionXLAdapterPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -310,17 +325,17 @@ class StableDiffusionXLAdapterPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -700,7 +715,12 @@ class StableDiffusionXLAdapterPipeline(

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -784,20 +804,22 @@ class StableDiffusionXLAdapterPipeline(
         return height, width

     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

         Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
             embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
@@ -834,6 +856,7 @@ class StableDiffusionXLAdapterPipeline(
         width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
+        sigmas: List[float] = None,
         denoising_end: Optional[float] = None,
         guidance_scale: float = 5.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -841,16 +864,16 @@ class StableDiffusionXLAdapterPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         guidance_rescale: float = 0.0,
@@ -874,9 +897,9 @@ class StableDiffusionXLAdapterPipeline(
             prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                 used in both text-encoders
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
                 The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
-                type is specified as `torch.FloatTensor`, it is passed to Adapter as is. `PIL.Image.Image` can also be
+                type is specified as `torch.Tensor`, it is passed to Adapter as is. `PIL.Image.Image` can also be
                 accepted as an image. The control image is automatically resized to fit the output image.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. Anything below 512 pixels won't work well for
@@ -893,6 +916,10 @@ class StableDiffusionXLAdapterPipeline(
                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             denoising_end (`float`, *optional*):
                 When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
                 completed before it is intentionally prematurely terminated. As a result, the returned sample will
@@ -921,30 +948,30 @@ class StableDiffusionXLAdapterPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -953,7 +980,7 @@ class StableDiffusionXLAdapterPipeline(
                 instead of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -1094,7 +1121,9 @@ class StableDiffusionXLAdapterPipeline(
         )

         # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )

         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
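Not part of the diff: a hedged end-to-end sketch of the `sigmas` argument now exposed by this pipeline. Model ids, the control image, and the sigma schedule are placeholders; the scheduler in use must accept `sigmas` in `set_timesteps`, otherwise the helper above raises a ValueError.

```python
# Illustrative sketch only: model ids, control image, and sigma values are placeholders.
import torch
from PIL import Image
from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter

adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16)
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", adapter=adapter, torch_dtype=torch.float16
).to("cuda")

control = Image.new("RGB", (1024, 1024))  # stand-in for a real Canny edge map
sigmas = [14.6, 9.0, 5.5, 3.0, 1.5, 0.7, 0.0]

# `sigmas` flows through retrieve_timesteps into scheduler.set_timesteps(sigmas=...);
# passing `timesteps` together with `sigmas` raises a ValueError.
image = pipe("a castle on a cliff", image=control, sigmas=sigmas).images[0]
```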
From diffusers/pipelines/text_to_video_synthesis/pipeline_output.py:

@@ -15,9 +15,10 @@ class TextToVideoSDPipelineOutput(BaseOutput):
     """
     Output class for text-to-video pipelines.

-
+    Args:
        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised
         PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
         `(batch_size, num_frames, channels, height, width)`
     """
From diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py:

@@ -15,11 +15,9 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

-import numpy as np
 import torch
 from transformers import CLIPTextModel, CLIPTokenizer

-from ...image_processor import VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet3DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -33,6 +31,7 @@ from ...utils import (
     unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from . import TextToVideoSDPipelineOutput

@@ -59,28 +58,6 @@ EXAMPLE_DOC_STRING = """
 """


-# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
 class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
@@ -127,7 +104,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
             scheduler=scheduler,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
@@ -137,8 +114,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -170,8 +147,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -191,10 +168,10 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -465,12 +442,12 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
         negative_prompt: Optional[Union[str, List[str]]] = None,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "np",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
@@ -505,25 +482,25 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                 `(batch_size, num_channel, num_frames, height, width)`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"np"`):
-                The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`.
+                The output format of the generated video. Choose between `torch.Tensor` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                 of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -652,7 +629,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
             video = latents
         else:
             video_tensor = self.decode_latents(latents)
-            video = tensor2vid(video_tensor, self.image_processor, output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

         # 9. Offload all models
         self.maybe_free_model_hooks()
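For orientation (not part of the diff), a rough sketch of the replacement path: the per-pipeline `tensor2vid` helper removed above is now covered by the new `VideoProcessor`. The `vae_scale_factor` and tensor shape below are illustrative.

```python
import torch
from diffusers.video_processor import VideoProcessor

# Mirrors what the pipeline now does internally; values are placeholders.
video_processor = VideoProcessor(do_resize=False, vae_scale_factor=8)

# decode_latents returns video in (batch, channels, num_frames, height, width) layout.
video_tensor = torch.rand(1, 3, 16, 64, 64)

frames = video_processor.postprocess_video(video=video_tensor, output_type="pil")
# Like the removed tensor2vid: "np"/"pt" give stacked arrays/tensors, while "pil"
# gives a list (one entry per batch item) of PIL frame sequences.
```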
From diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py:

@@ -16,11 +16,9 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL.Image
 import torch
 from transformers import CLIPTextModel, CLIPTokenizer

-from ...image_processor import VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet3DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -34,6 +32,7 @@ from ...utils import (
     unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from . import TextToVideoSDPipelineOutput

@@ -94,69 +93,6 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")


-# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
-def preprocess_video(video):
-    supported_formats = (np.ndarray, torch.Tensor, PIL.Image.Image)
-
-    if isinstance(video, supported_formats):
-        video = [video]
-    elif not (isinstance(video, list) and all(isinstance(i, supported_formats) for i in video)):
-        raise ValueError(
-            f"Input is in incorrect format: {[type(i) for i in video]}. Currently, we only support {', '.join(supported_formats)}"
-        )
-
-    if isinstance(video[0], PIL.Image.Image):
-        video = [np.array(frame) for frame in video]
-
-    if isinstance(video[0], np.ndarray):
-        video = np.concatenate(video, axis=0) if video[0].ndim == 5 else np.stack(video, axis=0)
-
-        if video.dtype == np.uint8:
-            video = np.array(video).astype(np.float32) / 255.0
-
-        if video.ndim == 4:
-            video = video[None, ...]
-
-        video = torch.from_numpy(video.transpose(0, 4, 1, 2, 3))
-
-    elif isinstance(video[0], torch.Tensor):
-        video = torch.cat(video, axis=0) if video[0].ndim == 5 else torch.stack(video, axis=0)
-
-        # don't need any preprocess if the video is latents
-        channel = video.shape[1]
-        if channel == 4:
-            return video
-
-        # move channels before num_frames
-        video = video.permute(0, 2, 1, 3, 4)
-
-        # normalize video
-        video = 2.0 * video - 1.0
-
-    return video
-
-
 class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-guided video-to-video generation.
@@ -203,7 +139,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
             scheduler=scheduler,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
@@ -213,8 +149,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -246,8 +182,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -267,10 +203,10 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -563,19 +499,19 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
-        video: Union[List[np.ndarray], torch.FloatTensor] = None,
+        video: Union[List[np.ndarray], torch.Tensor] = None,
         strength: float = 0.6,
         num_inference_steps: int = 50,
         guidance_scale: float = 15.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "np",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
@@ -586,7 +522,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            video (`List[np.ndarray]` or `torch.FloatTensor`):
+            video (`List[np.ndarray]` or `torch.Tensor`):
                 `video` frames or tensor representing a video batch to be used as the starting point for the process.
                 Can also accept video latents as `image`, if passing latents directly, it will not be encoded again.
             strength (`float`, *optional*, defaults to 0.8):
@@ -610,25 +546,25 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                 `(batch_size, num_channel, num_frames, height, width)`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"np"`):
-                The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`.
+                The output format of the generated video. Choose between `torch.Tensor` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                 of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -687,7 +623,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

         # 4. Preprocess video
-        video = preprocess_video(video)
+        video = self.video_processor.preprocess_video(video)

         # 5. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -749,7 +685,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
             video = latents
         else:
             video_tensor = self.decode_latents(latents)
-            video = tensor2vid(video_tensor, self.image_processor, output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

         # 10. Offload all models
         self.maybe_free_model_hooks()
|