diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +20 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +23 -25
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +46 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
- diffusers/schedulers/scheduling_edm_euler.py +53 -30
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
- diffusers/schedulers/scheduling_euler_discrete.py +163 -67
- diffusers/schedulers/scheduling_heun_discrete.py +60 -38
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +27 -25
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
- diffusers-0.28.0.dist-info/RECORD +414 -0
- diffusers-0.27.1.dist-info/RECORD +0 -399
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py

@@ -13,13 +13,14 @@
 # limitations under the License.
 
 import inspect
-from typing import Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import numpy as np
 import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
@@ -168,14 +169,18 @@ class StableDiffusionInstructPix2PixPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs,
     ):
         r"""
@@ -184,7 +189,7 @@ class StableDiffusionInstructPix2PixPipeline(
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
                 image latents as `image`, but if passing latents directly it is not encoded again.
             num_inference_steps (`int`, *optional*, defaults to 100):
@@ -194,7 +199,7 @@ class StableDiffusionInstructPix2PixPipeline(
                 A higher guidance scale value encourages the model to generate images closely linked to the text
                 `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
             image_guidance_scale (`float`, *optional*, defaults to 1.5):
-                Push the generated image towards the inital `image`. Image guidance scale is enabled by setting
+                Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
                 `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
                 linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
                 value of at least `1`.
@@ -209,14 +214,14 @@ class StableDiffusionInstructPix2PixPipeline(
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*):
@@ -226,15 +231,18 @@ class StableDiffusionInstructPix2PixPipeline(
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                 plain tuple.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
 
         Examples:
 
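The hunks above widen `callback_on_step_end` from a bare callable to a union that also accepts the `PipelineCallback` and `MultiPipelineCallbacks` classes introduced in the new `diffusers/callbacks.py`. A minimal sketch of the plain-function form, assuming the public `timbrooks/instruct-pix2pix` checkpoint and a placeholder input URL:

    import torch
    from diffusers import StableDiffusionInstructPix2PixPipeline
    from diffusers.utils import load_image

    pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
        "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
    ).to("cuda")

    def on_step_end(pipeline, step, timestep, callback_kwargs):
        # Only tensors named in callback_on_step_end_tensor_inputs arrive here;
        # return the dict (optionally modified) so denoising can continue.
        latents = callback_kwargs["latents"]
        print(f"step {step}: latents shape {tuple(latents.shape)}")
        return callback_kwargs

    image = load_image("https://example.com/input.png")  # placeholder URL
    result = pipe(
        "make it look like a watercolor painting",
        image=image,
        callback_on_step_end=on_step_end,
        callback_on_step_end_tensor_inputs=["latents"],
    )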
@@ -289,6 +297,9 @@ class StableDiffusionInstructPix2PixPipeline(
                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
             )
 
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
         # 0. Check inputs
         self.check_inputs(
             prompt,
@@ -296,6 +307,8 @@ class StableDiffusionInstructPix2PixPipeline(
             negative_prompt,
             prompt_embeds,
             negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
             callback_on_step_end_tensor_inputs,
         )
         self._guidance_scale = guidance_scale
@@ -303,14 +316,6 @@ class StableDiffusionInstructPix2PixPipeline(
 
         device = self._execution_device
 
-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if self.do_classifier_free_guidance:
-                image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds])
-
         if image is None:
             raise ValueError("`image` input cannot be undefined.")
 
@@ -335,6 +340,14 @@ class StableDiffusionInstructPix2PixPipeline(
             negative_prompt_embeds=negative_prompt_embeds,
         )
 
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
         # 3. Preprocess image
         image = self.image_processor.preprocess(image)
 
@@ -406,6 +419,7 @@ class StableDiffusionInstructPix2PixPipeline(
                     t,
                     encoder_hidden_states=prompt_embeds,
                     added_cond_kwargs=added_cond_kwargs,
+                    cross_attention_kwargs=cross_attention_kwargs,
                     return_dict=False,
                 )[0]
 
@@ -468,8 +482,8 @@ class StableDiffusionInstructPix2PixPipeline(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -487,10 +501,10 @@ class StableDiffusionInstructPix2PixPipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -635,6 +649,65 @@ class StableDiffusionInstructPix2PixPipeline(
 
         return image_embeds, uncond_image_embeds
 
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat(
+                        [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+                    )
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    (
+                        single_image_embeds,
+                        single_negative_image_embeds,
+                        single_negative_image_embeds,
+                    ) = single_image_embeds.chunk(3)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat(
+                        [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+                    )
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):
         if self.safety_checker is None:
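The new `prepare_ip_adapter_image_embeds` helper backs the new `ip_adapter_image_embeds` argument: image-prompt embeddings can be computed once and reused across calls instead of re-encoding the reference image every time. A sketch, assuming `pipe` from the earlier example, the public `h94/IP-Adapter` SD 1.5 weights, and placeholder `style_image`/`input_image` PIL images:

    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

    # Encode the reference image once. With classifier-free guidance this
    # pipeline concatenates [cond, negative, negative], matching the 3-way
    # chunk(3) in the method above.
    image_embeds = pipe.prepare_ip_adapter_image_embeds(
        ip_adapter_image=style_image,
        ip_adapter_image_embeds=None,
        device=pipe.device,
        num_images_per_prompt=1,
        do_classifier_free_guidance=True,
    )

    # Later calls reuse the precomputed embeddings without re-encoding.
    result = pipe(
        "turn the sky into a sunset",
        image=input_image,
        ip_adapter_image_embeds=image_embeds,
    )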
@@ -687,6 +760,8 @@ class StableDiffusionInstructPix2PixPipeline(
         negative_prompt=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
         callback_on_step_end_tensor_inputs=None,
     ):
         if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
@@ -728,9 +803,29 @@ class StableDiffusionInstructPix2PixPipeline(
                 f" {negative_prompt_embeds.shape}."
             )
 
+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
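The `prepare_latents` change above is behavioral, not just cosmetic: the `int()` casts guard against `height`/`width` arriving as floats, which would otherwise leak float dimensions into the latent shape and make `torch.randn` fail. A tiny illustration with made-up values:

    vae_scale_factor = 8
    height, width = 512.0, 768.0  # e.g. produced by a float rescaling step

    # Without the cast, 512.0 // 8 == 64.0 and torch.randn rejects float sizes.
    shape = (1, 4, int(height) // vae_scale_factor, int(width) // vae_scale_factor)
    print(shape)  # (1, 4, 64, 96)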
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py

@@ -221,7 +221,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
         )
 
         # verify batch size of prompt and image are same if image is a list or tensor
-        if isinstance(image, list) or isinstance(image, torch.Tensor):
+        if isinstance(image, (list, torch.Tensor)):
             if isinstance(prompt, str):
                 batch_size = 1
             else:
@@ -267,10 +267,10 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
         guidance_scale: float = 9.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
     ):
         r"""
@@ -279,7 +279,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide image upscaling.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image` or tensor representing an image batch to be upscaled. If it's a tensor, it can be either a
                 latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered
                 a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and
@@ -299,7 +299,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
@@ -310,7 +310,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py

@@ -176,8 +176,8 @@ class StableDiffusionUpscalePipeline(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -209,8 +209,8 @@ class StableDiffusionUpscalePipeline(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -230,10 +230,10 @@ class StableDiffusionUpscalePipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -468,7 +468,7 @@ class StableDiffusionUpscalePipeline(
         )
 
         # verify batch size of prompt and image are same if image is a list or tensor or numpy array
-        if isinstance(image, list) or isinstance(image, np.ndarray) or isinstance(image, torch.Tensor):
+        if isinstance(image, (list, np.ndarray, torch.Tensor)):
             if prompt is not None and isinstance(prompt, str):
                 batch_size = 1
             elif prompt is not None and isinstance(prompt, list):
@@ -542,12 +542,12 @@ class StableDiffusionUpscalePipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: int = None,
@@ -558,7 +558,7 @@ class StableDiffusionUpscalePipeline(
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image` or tensor representing an image batch to be upscaled.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -577,14 +577,14 @@ class StableDiffusionUpscalePipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -594,7 +594,7 @@ class StableDiffusionUpscalePipeline(
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py

@@ -76,7 +76,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
             Frozen [`CLIPTextModelWithProjection`] text-encoder.
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_scheduler ([`KarrasDiffusionSchedulers`]):
             Scheduler used in the prior denoising process.
         image_normalizer ([`StableUnCLIPImageNormalizer`]):
@@ -257,8 +257,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -290,8 +290,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -311,10 +311,10 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -588,7 +588,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
         self,
         image_embeds: torch.Tensor,
         noise_level: int,
-        noise: Optional[torch.FloatTensor] = None,
+        noise: Optional[torch.Tensor] = None,
         generator: Optional[torch.Generator] = None,
     ):
         """
@@ -644,19 +644,19 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         noise_level: int = 0,
         # prior args
         prior_num_inference_steps: int = 25,
         prior_guidance_scale: float = 4.0,
-        prior_latents: Optional[torch.FloatTensor] = None,
+        prior_latents: Optional[torch.Tensor] = None,
         clip_skip: Optional[int] = None,
     ):
         """
@@ -686,14 +686,14 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -702,7 +702,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
                 Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -718,7 +718,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
             prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                 A higher guidance scale value encourages the model to generate images closely linked to the text
                 `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
-            prior_latents (`torch.FloatTensor`, *optional*):
+            prior_latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 embedding generation in the prior denoising process. Can be used to tweak the same generation with
                 different prompts. If not provided, a latents tensor is generated by sampling using the supplied random
@@ -876,7 +876,12 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
 
         # 11. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         latents = self.prepare_latents(
             shape=shape,
             dtype=prompt_embeds.dtype,
|