diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +20 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +23 -25
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +46 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
- diffusers/schedulers/scheduling_edm_euler.py +53 -30
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
- diffusers/schedulers/scheduling_euler_discrete.py +163 -67
- diffusers/schedulers/scheduling_heun_discrete.py +60 -38
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +27 -25
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
- diffusers-0.28.0.dist-info/RECORD +414 -0
- diffusers-0.27.1.dist-info/RECORD +0 -399
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py

@@ -119,7 +119,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -135,6 +135,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):

     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
     _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]

     def __init__(
         self,
@@ -178,7 +179,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -186,8 +187,8 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)

     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -212,9 +213,9 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -258,7 +259,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -346,7 +347,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -362,6 +363,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):

     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
     _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]

     def __init__(
         self,
@@ -405,17 +407,17 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload()
-        self.decoder_pipe.enable_model_cpu_offload()
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)

-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -423,8 +425,8 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)

     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -440,7 +442,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
@@ -451,9 +453,9 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -467,7 +469,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -507,7 +509,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -516,7 +518,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -594,7 +596,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -610,6 +612,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):

     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
     _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]

     def __init__(
         self,
@@ -653,7 +656,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -661,8 +664,8 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)

     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -678,8 +681,8 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
@@ -689,7 +692,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -704,7 +707,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -743,7 +746,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
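Note on the offload hunks above: `enable_model_cpu_offload` and `enable_sequential_cpu_offload` now take an explicit `device` alongside `gpu_id`, and the combined pipelines forward both arguments to their prior and decoder sub-pipelines. A minimal sketch of the new call pattern (the checkpoint id and prompt are illustrative, not part of this diff):

    import torch
    from diffusers import AutoPipelineForText2Image

    # Load the Kandinsky 2.2 combined text-to-image pipeline; any checkpoint
    # with connected prior/decoder pipes works the same way.
    pipe = AutoPipelineForText2Image.from_pretrained(
        "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
    )
    # New in 0.28.0: the target accelerator can be named explicitly;
    # 0.27.1 only accepted an integer gpu_id (default 0).
    pipe.enable_sequential_cpu_offload(device="cuda")
    image = pipe("a portrait of a corgi, studio lighting").images[0]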
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py

@@ -151,18 +151,18 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        hint: torch.FloatTensor,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        hint: torch.Tensor,
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -172,11 +172,11 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            hint (`torch.FloatTensor`):
+            hint (`torch.Tensor`):
                 The controlnet condition.
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -199,7 +199,7 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -208,7 +208,7 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py

@@ -206,10 +206,10 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        hint: torch.FloatTensor,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        hint: torch.Tensor,
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
@@ -218,7 +218,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -226,9 +226,9 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
         Function invoked when calling the pipeline for generation.

         Args:
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -238,9 +238,9 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
                 denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                 be maximum and the denoising process will run for the full number of iterations specified in
                 `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
-            hint (`torch.FloatTensor`):
+            hint (`torch.Tensor`):
                 The controlnet condition.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
@@ -265,7 +265,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py

@@ -190,9 +190,9 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
@@ -210,9 +210,9 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
         Function invoked when calling the pipeline for generation.

         Args:
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -222,7 +222,7 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
                 denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                 be maximum and the denoising process will run for the full number of iterations specified in
                 `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py

@@ -294,17 +294,17 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -315,7 +315,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
         Function invoked when calling the pipeline for generation.

         Args:
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
             image (`PIL.Image.Image`):
                 `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
@@ -325,7 +325,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
                 black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
                 channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
                 so the expected shape would be `(B, H, W, 1)`.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
@@ -345,7 +345,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py

@@ -90,7 +90,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):

     Args:
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         text_encoder ([`CLIPTextModelWithProjection`]):
@@ -132,12 +132,12 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
     def interpolate(
         self,
-        images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
         weights: List[float],
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         negative_prior_prompt: Optional[str] = None,
         negative_prompt: str = "",
         guidance_scale: float = 4.0,
@@ -147,7 +147,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
         Function invoked when using the prior pipeline for interpolation.

         Args:
-            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
                 list of prompts and images to guide the image generation.
             weights: (`List[float]`):
                 list of weights for each condition in `images_and_prompts`
@@ -159,7 +159,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -376,7 +376,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 4.0,
         output_type: Optional[str] = "pt",  # pt only
         return_dict: bool = True,
@@ -400,7 +400,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
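The `interpolate` entry point above now annotates tensor conditions as plain `torch.Tensor`. A hedged sketch of driving it, mixing a text prompt with an image (the checkpoint id, image URL, and weights are illustrative, not taken from this diff):

    import torch
    from diffusers import KandinskyV22PriorPipeline
    from diffusers.utils import load_image

    prior = KandinskyV22PriorPipeline.from_pretrained(
        "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
    ).to("cuda")

    cat = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
    )
    # One weight per condition; the conditions are blended in CLIP
    # image-embedding space before the decoder ever runs.
    out = prior.interpolate(["a photorealistic lion", cat], weights=[0.3, 0.7])
    image_embeds, negative_image_embeds = out.image_embeds, out.negative_image_embeds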
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py

@@ -108,7 +108,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):

     Args:
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         text_encoder ([`CLIPTextModelWithProjection`]):
@@ -156,12 +156,12 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
     def interpolate(
         self,
-        images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
         weights: List[float],
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         negative_prior_prompt: Optional[str] = None,
         negative_prompt: str = "",
         guidance_scale: float = 4.0,
@@ -171,7 +171,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
         Function invoked when using the prior pipeline for interpolation.

         Args:
-            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
                 list of prompts and images to guide the image generation.
             weights: (`List[float]`):
                 list of weights for each condition in `images_and_prompts`
@@ -183,7 +183,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -418,7 +418,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
                 Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image`
                 will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                 denoising steps depends on the amount of noise initially added.
-            emb (`torch.FloatTensor`):
+            emb (`torch.Tensor`):
                 The image embedding.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py

@@ -35,10 +35,10 @@ DYNAMIC_MAP = {

 def convert_state_dict(unet_state_dict):
     """
-    Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
     Args:
-        unet_model (torch.nn.Module): The original U-Net model.
-        unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with.
+    Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
+        unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet
+        model to match keys with.

     Returns:
         OrderedDict: The converted state dictionary.
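Across the hunks above, the public type hints move from `torch.FloatTensor` to `torch.Tensor`. Existing user code keeps working, since `FloatTensor` is a subclass of `Tensor`; a minimal sketch of a step callback written against the updated annotation (the pipeline call is commented out as a placeholder):

    import torch

    # Matches the 0.28.0 hint: Callable[[int, int, torch.Tensor], None]
    def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
        print(f"step={step} timestep={timestep} latents shape={tuple(latents.shape)}")

    # image = pipe(prompt, callback=log_latents, callback_steps=5).images[0]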
|