diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +20 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +23 -25
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +46 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
- diffusers/schedulers/scheduling_edm_euler.py +53 -30
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
- diffusers/schedulers/scheduling_euler_discrete.py +163 -67
- diffusers/schedulers/scheduling_heun_discrete.py +60 -38
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +27 -25
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
- diffusers-0.28.0.dist-info/RECORD +414 -0
- diffusers-0.27.1.dist-info/RECORD +0 -399
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
CHANGED
```diff
@@ -197,7 +197,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
             and not isinstance(image, list)
         ):
             raise ValueError(
-                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                 f" {type(image)}"
             )
 
@@ -214,7 +214,12 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -242,10 +247,10 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         **kwargs,
     ):
@@ -276,7 +281,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
@@ -287,7 +292,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
```
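The `prepare_latents` change above (repeated in the text-to-image pipeline below) is behavioral, not just cosmetic: the latent dimensions are now cast to `int` before the floor division. A minimal sketch of the difference, assuming a typical `vae_scale_factor` of 8; the float inputs are an illustrative edge case, not taken from the diff:

```python
batch_size, num_channels_latents, vae_scale_factor = 1, 4, 8
height, width = 512.0, 768.0  # dimensions that arrive as floats, e.g. from a config

# 0.27.1-style computation: float // int yields floats, and
# torch.randn(shape) rejects non-integer sizes downstream.
old_shape = (batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)
print(old_shape)  # (1, 4, 64.0, 96.0)

# 0.28.0-style computation: explicit int() casts keep the shape integral.
new_shape = (
    batch_size,
    num_channels_latents,
    int(height) // vae_scale_factor,
    int(width) // vae_scale_factor,
)
print(new_shape)  # (1, 4, 64, 96)
```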
diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
CHANGED
```diff
@@ -300,7 +300,12 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -328,10 +333,10 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         **kwargs,
     ):
@@ -362,7 +367,7 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
@@ -373,7 +378,7 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
```
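Both versatile-diffusion files also retype the legacy `callback` argument. A hedged caller-side sketch of a function matching the updated `callback(step: int, timestep: int, latents: torch.Tensor)` contract; the `log_latents` name is illustrative, not from the diff:

```python
import torch

# Only the annotation changed (torch.FloatTensor -> torch.Tensor), so
# existing callbacks keep working; this one just logs progress.
def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    print(f"step={step} timestep={timestep} latents shape={tuple(latents.shape)}")

# Passed to a pipeline per the signature above:
# pipe(prompt, callback=log_latents, callback_steps=1)
```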
diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py
CHANGED
```diff
@@ -169,10 +169,10 @@ class VQDiffusionPipeline(DiffusionPipeline):
         truncation_rate: float = 1.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
     ) -> Union[ImagePipelineOutput, Tuple]:
         """
@@ -196,7 +196,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor` of shape (batch), *optional*):
+            latents (`torch.Tensor` of shape (batch), *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Must be valid embedding indices.If not provided, a latents tensor will be generated of
                 completely masked latent pixels.
@@ -206,7 +206,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
                 Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -301,7 +301,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
 
         return ImagePipelineOutput(images=image)
 
-    def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor:
+    def truncate(self, log_p_x_0: torch.Tensor, truncation_rate: float) -> torch.Tensor:
         """
         Truncates `log_p_x_0` such that for each column vector, the total cumulative probability is `truncation_rate`
         The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to
```
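The retyped `truncate` method keeps, per position, only the most probable latent codes whose cumulative probability stays within `truncation_rate`. A standalone sketch of that operation; the function name and the `(batch, num_classes, seq_len)` layout are assumptions for illustration, not the pipeline's exact code:

```python
import torch

def truncate_log_probs(log_p: torch.Tensor, truncation_rate: float) -> torch.Tensor:
    # log_p: (batch, num_classes, seq_len) log-probabilities over latent codes.
    sorted_log_p, indices = torch.sort(log_p, dim=1, descending=True)
    cumulative = sorted_log_p.exp().cumsum(dim=1)
    # Keep the smallest prefix of top codes whose mass reaches truncation_rate;
    # shifting the mask down by one row always keeps the single most likely code.
    keep = cumulative < truncation_rate
    keep = torch.cat([torch.ones_like(keep[:, :1]), keep[:, :-1]], dim=1)
    # Scatter the mask back to the original class order; dropped codes get
    # log-probability -inf, i.e. probability zero.
    mask = torch.zeros_like(keep).scatter_(1, indices, keep)
    return log_p.masked_fill(~mask, -float("inf"))

log_p = torch.log_softmax(torch.randn(2, 16, 9), dim=1)
truncated = truncate_log_probs(log_p, truncation_rate=0.9)
```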
diffusers/pipelines/free_init_utils.py
CHANGED
```diff
@@ -41,20 +41,20 @@ class FreeInitMixin:
             num_iters (`int`, *optional*, defaults to `3`):
                 Number of FreeInit noise re-initialization iterations.
             use_fast_sampling (`bool`, *optional*, defaults to `False`):
-                Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
-
+                Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the
+                "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
             method (`str`, *optional*, defaults to `butterworth`):
-                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
-
+                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low
+                pass filter.
             order (`int`, *optional*, defaults to `4`):
                 Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
                 whereas lower values lead to `gaussian` method behaviour.
             spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
-
+                Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the
+                original implementation.
             temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
-
+                Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the
+                original implementation.
         """
         self._free_init_num_iters = num_iters
         self._free_init_use_fast_sampling = use_fast_sampling
```
```diff
@@ -146,39 +146,40 @@ class FreeInitMixin:
     ):
         if free_init_iteration == 0:
             self._free_init_initial_noise = latents.detach().clone()
-
-        latent_shape = latents.shape
-
-        free_init_filter_shape = (1, *latent_shape[1:])
-        free_init_freq_filter = self._get_free_init_freq_filter(
-            shape=free_init_filter_shape,
-            device=device,
-            filter_type=self._free_init_method,
-            order=self._free_init_order,
-            spatial_stop_frequency=self._free_init_spatial_stop_frequency,
-            temporal_stop_frequency=self._free_init_temporal_stop_frequency,
-        )
-
-        current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
-        diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
-
-        z_t = self.scheduler.add_noise(
-            original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
-        ).to(dtype=torch.float32)
-
-        z_rand = randn_tensor(
-            shape=latent_shape,
-            generator=generator,
-            device=device,
-            dtype=torch.float32,
-        )
-        latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
-
-        latents = latents.to(dtype)
+        else:
+            latent_shape = latents.shape
+
+            free_init_filter_shape = (1, *latent_shape[1:])
+            free_init_freq_filter = self._get_free_init_freq_filter(
+                shape=free_init_filter_shape,
+                device=device,
+                filter_type=self._free_init_method,
+                order=self._free_init_order,
+                spatial_stop_frequency=self._free_init_spatial_stop_frequency,
+                temporal_stop_frequency=self._free_init_temporal_stop_frequency,
+            )
+
+            current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
+            diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
+
+            z_t = self.scheduler.add_noise(
+                original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
+            ).to(dtype=torch.float32)
+
+            z_rand = randn_tensor(
+                shape=latent_shape,
+                generator=generator,
+                device=device,
+                dtype=torch.float32,
+            )
+            latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
+            latents = latents.to(dtype)
 
         # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
         if self._free_init_use_fast_sampling:
-            num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
+            num_inference_steps = max(
+                1, int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
+            )
             self.scheduler.set_timesteps(num_inference_steps, device=device)
 
         return latents, self.scheduler.timesteps
```
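Two things change here: noise re-initialization is now skipped on the first FreeInit iteration (the new `else:` branch), and the coarse-to-fine schedule gains a `max(1, ...)` guard. A small sketch of the resulting step counts per iteration; the helper name is illustrative:

```python
def fast_sampling_schedule(num_inference_steps: int, num_iters: int) -> list[int]:
    # Mirrors the patched line: each FreeInit iteration runs a growing
    # fraction of the full schedule; max(1, ...) is the 0.28.0 guard that
    # prevents a zero-step schedule when num_inference_steps < num_iters.
    return [
        max(1, int(num_inference_steps / num_iters * (i + 1)))
        for i in range(num_iters)
    ]

print(fast_sampling_schedule(50, 3))  # [16, 33, 50]
print(fast_sampling_schedule(2, 3))   # [1, 1, 2]  (the old code would yield 0 for the first iteration)
```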
diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
CHANGED
```diff
@@ -31,6 +31,7 @@ from ...utils import (
     replace_example_docstring,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 
 
@@ -43,10 +44,14 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import I2VGenXLPipeline
         >>> from diffusers.utils import export_to_gif, load_image
 
-        >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+        >>> pipeline = I2VGenXLPipeline.from_pretrained(
+        ...     "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
+        ... )
        >>> pipeline.enable_model_cpu_offload()
 
-        >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+        >>> image_url = (
+        ...     "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+        ... )
         >>> image = load_image(image_url).convert("RGB")
 
         >>> prompt = "Papers were floating in the air on a table in the library"
@@ -59,43 +64,22 @@ EXAMPLE_DOC_STRING = """
         ...     num_inference_steps=50,
         ...     negative_prompt=negative_prompt,
         ...     guidance_scale=9.0,
-        ...     generator=generator
+        ...     generator=generator,
         ... ).frames[0]
         >>> video_path = export_to_gif(frames, "i2v.gif")
         ```
 """
 
 
-# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
 @dataclass
 class I2VGenXLPipelineOutput(BaseOutput):
     r"""
     Output class for image-to-video pipeline.
 
-
+    Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised
             PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
             `(batch_size, num_frames, channels, height, width)`
     """
@@ -151,7 +135,7 @@ class I2VGenXLPipeline(
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         # `do_resize=False` as we do custom resizing.
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
 
     @property
     def guidance_scale(self):
@@ -170,8 +154,8 @@ class I2VGenXLPipeline(
         device,
         num_videos_per_prompt,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         clip_skip: Optional[int] = None,
     ):
         r"""
@@ -190,10 +174,10 @@ class I2VGenXLPipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -337,8 +321,8 @@ class I2VGenXLPipeline(
         dtype = next(self.image_encoder.parameters()).dtype
 
         if not isinstance(image, torch.Tensor):
-            image = self.image_processor.pil_to_numpy(image)
-            image = self.image_processor.numpy_to_pt(image)
+            image = self.video_processor.pil_to_numpy(image)
+            image = self.video_processor.numpy_to_pt(image)
 
         # Normalize the image with CLIP training stats.
         image = self.feature_extractor(
@@ -450,7 +434,7 @@ class I2VGenXLPipeline(
             and not isinstance(image, list)
         ):
             raise ValueError(
-                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                 f" {type(image)}"
             )
 
@@ -529,9 +513,9 @@ class I2VGenXLPipeline(
         num_videos_per_prompt: Optional[int] = 1,
         decode_chunk_size: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -543,7 +527,7 @@ class I2VGenXLPipeline(
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
                 Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                 [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
@@ -551,7 +535,8 @@ class I2VGenXLPipeline(
             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The width in pixels of the generated image.
             target_fps (`int`, *optional*):
-                Frames per second. The rate at which the generated images shall be exported to a video after
+                Frames per second. The rate at which the generated images shall be exported to a video after
+                generation. This is also used as a "micro-condition" while generation.
             num_frames (`int`, *optional*):
                 The number of video frames to generate.
             num_inference_steps (`int`, *optional*):
@@ -568,20 +553,20 @@ class I2VGenXLPipeline(
             num_videos_per_prompt (`int`, *optional*):
                 The number of images to generate per prompt.
             decode_chunk_size (`int`, *optional*):
-                The number of frames to decode at a time. The higher the chunk size, the higher the temporal
-                between frames, but also the higher the memory consumption. By default, the decoder will
-                for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
+                The number of frames to decode at a time. The higher the chunk size, the higher the temporal
+                consistency between frames, but also the higher the memory consumption. By default, the decoder will
+                decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -651,7 +636,7 @@ class I2VGenXLPipeline(
 
         # 3.2.2 Image latents.
         resized_image = _center_crop_wide(image, (width, height))
-        image = self.image_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
+        image = self.video_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
         image_latents = self.prepare_image_latents(
             image,
             device=device,
@@ -731,7 +716,7 @@ class I2VGenXLPipeline(
             video = latents
         else:
            video_tensor = self.decode_latents(latents, decode_chunk_size=decode_chunk_size)
-            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 9. Offload all models
         self.maybe_free_model_hooks()
```
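This file swaps the removed module-level `tensor2vid` helper and the pipeline's `VaeImageProcessor` attribute for the new `VideoProcessor` (added in `diffusers/video_processor.py`, +113 lines in the list above). A hedged sketch of the replacement call, with input/output shapes assumed from the removed helper rather than the released docs:

```python
import torch
from diffusers.video_processor import VideoProcessor

video_processor = VideoProcessor(vae_scale_factor=8, do_resize=False)

# postprocess_video plays the role tensor2vid used to: it takes decoded
# frames shaped (batch, channels, num_frames, height, width) and returns
# PIL frames, a NumPy array, or a tensor depending on output_type.
video_tensor = torch.rand(1, 3, 16, 64, 64)
frames = video_processor.postprocess_video(video=video_tensor, output_type="np")
print(frames.shape)  # expected (1, 16, 64, 64, 3)
```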
diffusers/pipelines/kandinsky/pipeline_kandinsky.py
CHANGED
```diff
@@ -233,8 +233,8 @@ class KandinskyPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: int = 512,
         width: int = 512,
@@ -242,9 +242,9 @@ class KandinskyPipeline(DiffusionPipeline):
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -254,9 +254,9 @@ class KandinskyPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -279,7 +279,7 @@ class KandinskyPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -288,7 +288,7 @@ class KandinskyPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
```
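For context on the retyped `image_embeds`/`negative_image_embeds` arguments, a hedged end-to-end sketch of where those tensors come from; the model ids and the prior-then-decoder flow follow the Kandinsky model cards, not this diff:

```python
import torch
from diffusers import KandinskyPriorPipeline, KandinskyPipeline

# The prior produces the CLIP image embeddings that __call__ above consumes.
prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
).to("cuda")
pipe = KandinskyPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
).to("cuda")

prompt = "a watercolor painting of a fox"
image_embeds, negative_image_embeds = prior(prompt).to_tuple()
image = pipe(
    prompt,
    image_embeds=image_embeds,
    negative_image_embeds=negative_image_embeds,
    height=512,
    width=512,
).images[0]
```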