diffusers 0.27.2__py3-none-any.whl → 0.28.1__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- diffusers/__init__.py +26 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +33 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +8 -0
- diffusers/models/activations.py +23 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +475 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +363 -32
- diffusers/models/model_loading_utils.py +177 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_outputs.py +14 -0
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +175 -99
- diffusers/models/normalization.py +2 -1
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/__init__.py +3 -0
- diffusers/models/transformers/dit_transformer_2d.py +240 -0
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/hunyuan_transformer_2d.py +427 -0
- diffusers/models/transformers/pixart_transformer_2d.py +336 -0
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +292 -184
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +19 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +27 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +7 -4
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/hunyuandit/__init__.py +48 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +881 -0
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +269 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +69 -79
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +20 -26
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +42 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
- diffusers/schedulers/scheduling_edm_euler.py +50 -31
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
- diffusers/schedulers/scheduling_euler_discrete.py +160 -68
- diffusers/schedulers/scheduling_heun_discrete.py +57 -39
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +24 -26
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +75 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +105 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/METADATA +7 -7
- diffusers-0.28.1.dist-info/RECORD +419 -0
- diffusers-0.27.2.dist-info/RECORD +0 -399
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/LICENSE +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/WHEEL +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/top_level.txt +0 -0
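Of the new modules in the listing, diffusers/callbacks.py (+156) is a small self-contained API: reusable callback classes for the pipelines' `callback_on_step_end` hook. A minimal sketch, assuming the `SDCFGCutoffCallback` name and its `cutoff_step_ratio` argument from that module (the module body is not shown in this diff):

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.callbacks import SDCFGCutoffCallback  # assumed name from the new callbacks.py

pipeline = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Disable classifier-free guidance after 40% of the denoising steps; past the
# cutoff the callback turns guidance off for the remaining steps.
callback = SDCFGCutoffCallback(cutoff_step_ratio=0.4)
image = pipeline(
    "a photo of an astronaut riding a horse",
    callback_on_step_end=callback,
).images[0]
```

The hunks below show two of the touched files in detail: the I2VGen-XL pipeline and the Kandinsky pipelines.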
diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py

@@ -31,6 +31,7 @@ from ...utils import (
     replace_example_docstring,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 
 
@@ -43,10 +44,14 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import I2VGenXLPipeline
         >>> from diffusers.utils import export_to_gif, load_image
 
-        >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+        >>> pipeline = I2VGenXLPipeline.from_pretrained(
+        ...     "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
+        ... )
         >>> pipeline.enable_model_cpu_offload()
 
-        >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+        >>> image_url = (
+        ...     "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+        ... )
         >>> image = load_image(image_url).convert("RGB")
 
         >>> prompt = "Papers were floating in the air on a table in the library"
@@ -59,43 +64,22 @@ EXAMPLE_DOC_STRING = """
         ...     num_inference_steps=50,
         ...     negative_prompt=negative_prompt,
         ...     guidance_scale=9.0,
-        ...     generator=generator
+        ...     generator=generator,
         ... ).frames[0]
         >>> video_path = export_to_gif(frames, "i2v.gif")
         ```
 """
 
 
-# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
 @dataclass
 class I2VGenXLPipelineOutput(BaseOutput):
     r"""
     Output class for image-to-video pipeline.
 
-
+    Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised
             PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
             `(batch_size, num_frames, channels, height, width)`
     """
@@ -151,7 +135,7 @@ class I2VGenXLPipeline(
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         # `do_resize=False` as we do custom resizing.
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
 
     @property
     def guidance_scale(self):
@@ -170,8 +154,8 @@ class I2VGenXLPipeline(
         device,
         num_videos_per_prompt,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         clip_skip: Optional[int] = None,
     ):
         r"""
@@ -190,10 +174,10 @@ class I2VGenXLPipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -337,8 +321,8 @@ class I2VGenXLPipeline(
         dtype = next(self.image_encoder.parameters()).dtype
 
         if not isinstance(image, torch.Tensor):
-            image = self.image_processor.pil_to_numpy(image)
-            image = self.image_processor.numpy_to_pt(image)
+            image = self.video_processor.pil_to_numpy(image)
+            image = self.video_processor.numpy_to_pt(image)
 
         # Normalize the image with CLIP training stats.
         image = self.feature_extractor(
@@ -450,7 +434,7 @@ class I2VGenXLPipeline(
             and not isinstance(image, list)
         ):
             raise ValueError(
-                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                 f" {type(image)}"
             )
 
@@ -529,9 +513,9 @@ class I2VGenXLPipeline(
         num_videos_per_prompt: Optional[int] = 1,
        decode_chunk_size: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -543,7 +527,7 @@ class I2VGenXLPipeline(
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
                 Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                 [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
@@ -551,7 +535,8 @@ class I2VGenXLPipeline(
             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The width in pixels of the generated image.
             target_fps (`int`, *optional*):
-                Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation.
+                Frames per second. The rate at which the generated images shall be exported to a video after
+                generation. This is also used as a "micro-condition" while generation.
             num_frames (`int`, *optional*):
                 The number of video frames to generate.
             num_inference_steps (`int`, *optional*):
@@ -568,20 +553,20 @@ class I2VGenXLPipeline(
             num_videos_per_prompt (`int`, *optional*):
                 The number of images to generate per prompt.
             decode_chunk_size (`int`, *optional*):
-                The number of frames to decode at a time. The higher the chunk size, the higher the temporal
-                between frames, but also the higher the memory consumption. By default, the decoder will
-                for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
+                The number of frames to decode at a time. The higher the chunk size, the higher the temporal
+                consistency between frames, but also the higher the memory consumption. By default, the decoder will
+                decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -651,7 +636,7 @@ class I2VGenXLPipeline(
 
         # 3.2.2 Image latents.
         resized_image = _center_crop_wide(image, (width, height))
-        image = self.image_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
+        image = self.video_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
         image_latents = self.prepare_image_latents(
             image,
             device=device,
@@ -731,7 +716,7 @@ class I2VGenXLPipeline(
             video = latents
         else:
             video_tensor = self.decode_latents(latents, decode_chunk_size=decode_chunk_size)
-            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 9. Offload all models
         self.maybe_free_model_hooks()
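The hunks above retire the module-level tensor2vid() helper in favor of the shared VideoProcessor from the new diffusers/video_processor.py. The constructor arguments and method names below are taken from the diff itself; the tensor shape and scale factor are illustrative:

```python
import torch
from diffusers.video_processor import VideoProcessor

# Illustrative decoded-video tensor: (batch, channels, num_frames, height, width), values in [-1, 1].
video_tensor = torch.rand(1, 3, 16, 256, 256) * 2 - 1

processor = VideoProcessor(vae_scale_factor=8, do_resize=False)

# Equivalent of the removed tensor2vid(video_tensor, image_processor, output_type="np"):
frames = processor.postprocess_video(video=video_tensor, output_type="np")
print(frames.shape)  # expected (1, 16, 256, 256, 3) for "np" output
```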
diffusers/pipelines/kandinsky/pipeline_kandinsky.py

@@ -233,8 +233,8 @@ class KandinskyPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: int = 512,
         width: int = 512,
@@ -242,9 +242,9 @@ class KandinskyPipeline(DiffusionPipeline):
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -254,9 +254,9 @@ class KandinskyPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -279,7 +279,7 @@ class KandinskyPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -288,7 +288,7 @@ class KandinskyPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py

@@ -129,7 +129,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -143,6 +143,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
 
     _load_connected_pipes = True
     model_cpu_offload_seq = "text_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoder"
+    _exclude_from_cpu_offload = ["prior_prior"]
 
     def __init__(
         self,
@@ -225,9 +226,9 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -267,7 +268,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -276,7 +277,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -346,7 +347,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -360,6 +361,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
 
     _load_connected_pipes = True
     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq"
+    _exclude_from_cpu_offload = ["prior_prior"]
 
     def __init__(
         self,
@@ -434,7 +436,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
@@ -445,9 +447,9 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -457,7 +459,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -497,7 +499,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -506,7 +508,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -586,7 +588,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -600,6 +602,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
 
     _load_connected_pipes = True
     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movq"
+    _exclude_from_cpu_offload = ["prior_prior"]
 
     def __init__(
         self,
@@ -674,8 +677,8 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
@@ -685,9 +688,9 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -697,7 +700,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -736,7 +739,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -745,7 +748,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
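Beyond the torch.FloatTensor → torch.Tensor annotation sweep, each combined Kandinsky pipeline gains `_exclude_from_cpu_offload = ["prior_prior"]`, which keeps the prior transformer out of the hook-based offload chain when model CPU offload is enabled. A sketch of the call path this affects, assuming the public kandinsky-community/kandinsky-2-1 checkpoint (which AutoPipelineForText2Image resolves to the combined pipeline):

```python
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
)

# With 0.28.1, enable_model_cpu_offload() skips `prior_prior` (per
# `_exclude_from_cpu_offload`), so the prior transformer is no longer wrapped
# with its own CPU-offload hook.
pipeline.enable_model_cpu_offload()

image = pipeline("A lion reading a book, 4k photo").images[0]
```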
diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py

@@ -266,10 +266,10 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
     # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling
     def add_noise(
         self,
-        original_samples: torch.FloatTensor,
-        noise: torch.FloatTensor,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
         timesteps: torch.IntTensor,
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32)
         alphas = 1.0 - betas
         alphas_cumprod = torch.cumprod(alphas, dim=0)
@@ -295,9 +295,9 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        image_embeds: torch.FloatTensor,
-        negative_image_embeds: torch.FloatTensor,
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        image_embeds: torch.Tensor,
+        negative_image_embeds: torch.Tensor,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: int = 512,
         width: int = 512,
@@ -307,7 +307,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -317,12 +317,12 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`):
+            image (`torch.Tensor`, `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -356,7 +356,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
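The first hunk above only retypes KandinskyImg2ImgPipeline's custom add_noise() override, but the override itself is worth spelling out: the pipeline noises latents with a fixed linear beta schedule rather than the scheduler's own, using the standard forward-diffusion identity x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps. A self-contained sketch of that computation (the broadcasting details are illustrative, not copied from the pipeline):

```python
import torch

# Linear beta schedule from the hunk above.
betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

def add_noise(original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
    # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_one_minus_alpha_prod = (1.0 - alphas_cumprod[timesteps]) ** 0.5
    # Broadcast the per-timestep scalars over the sample dimensions.
    while sqrt_alpha_prod.ndim < original_samples.ndim:
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
    return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise

# Usage: noise a batch of 64x64 latents to step 500 of 1000.
x0 = torch.randn(1, 4, 64, 64)
xt = add_noise(x0, torch.randn_like(x0), torch.tensor([500]))
```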
diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py

@@ -398,10 +398,10 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
-        image_embeds: torch.FloatTensor,
-        negative_image_embeds: torch.FloatTensor,
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+        image_embeds: torch.Tensor,
+        negative_image_embeds: torch.Tensor,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: int = 512,
         width: int = 512,
@@ -409,9 +409,9 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -421,10 +421,10 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`):
+            image (`torch.Tensor`, `PIL.Image.Image` or `np.ndarray`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
-            mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`):
+            mask_image (`PIL.Image.Image`,`torch.Tensor` or `np.ndarray`):
                 `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if the
                 image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the
@@ -432,9 +432,9 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
                 image or numpy array, mask should also be a either PIL image or numpy array. If it is a PIL image, it
                 will be converted to a single channel (luminance) before use. If it is a nummpy array, the expected
                 shape is `(H, W)`.
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -457,7 +457,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -466,7 +466,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
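For reference, the mask conventions in the docstring above (white repaints, black preserves; NumPy masks are single-channel `(H, W)`) translate into, e.g.:

```python
import numpy as np

# Single-channel (H, W) mask: 1.0 (white) marks the region to repaint,
# 0.0 (black) marks pixels to keep. A PIL image mask would be converted
# to luminance the same way.
mask = np.zeros((512, 512), dtype=np.float32)
mask[150:350, 150:350] = 1.0  # repaint a centered square
```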
diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py

@@ -115,14 +115,14 @@ class KandinskyPriorPipelineOutput(BaseOutput):
     Output class for KandinskyPriorPipeline.
 
     Args:
-        image_embeds (`torch.FloatTensor`)
+        image_embeds (`torch.Tensor`)
             clip image embeddings for text prompt
         negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`)
             clip image embeddings for unconditional tokens
     """
 
-    image_embeds: Union[torch.FloatTensor, np.ndarray]
-    negative_image_embeds: Union[torch.FloatTensor, np.ndarray]
+    image_embeds: Union[torch.Tensor, np.ndarray]
+    negative_image_embeds: Union[torch.Tensor, np.ndarray]
 
 
 class KandinskyPriorPipeline(DiffusionPipeline):
@@ -134,7 +134,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
 
     Args:
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         text_encoder ([`CLIPTextModelWithProjection`]):
@@ -173,12 +173,12 @@ class KandinskyPriorPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
     def interpolate(
         self,
-        images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
         weights: List[float],
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         negative_prior_prompt: Optional[str] = None,
         negative_prompt: str = "",
         guidance_scale: float = 4.0,
@@ -188,7 +188,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
         Function invoked when using the prior pipeline for interpolation.
 
         Args:
-            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
                 list of prompts and images to guide the image generation.
             weights: (`List[float]`):
                 list of weights for each condition in `images_and_prompts`
@@ -200,7 +200,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -403,7 +403,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 4.0,
         output_type: Optional[str] = "pt",
         return_dict: bool = True,
@@ -425,7 +425,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
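Taken together with the KandinskyPipeline hunks earlier, the prior's output fields map one-to-one onto the decoder's image_embeds / negative_image_embeds arguments. A sketch assuming the public kandinsky-community checkpoints:

```python
import torch
from diffusers import KandinskyPriorPipeline, KandinskyPipeline

prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
).to("cuda")
decoder = KandinskyPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
).to("cuda")

prompt = "red cat, 4k photo"
# KandinskyPriorPipelineOutput carries image_embeds / negative_image_embeds.
prior_out = prior(prompt)

image = decoder(
    prompt,
    image_embeds=prior_out.image_embeds,
    negative_image_embeds=prior_out.negative_image_embeds,
    height=512,
    width=512,
).images[0]
```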
|