diffusers 0.27.1-py3-none-any.whl → 0.28.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +18 -1
- diffusers/callbacks.py +156 -0
- diffusers/commands/env.py +110 -6
- diffusers/configuration_utils.py +16 -11
- diffusers/dependency_versions_table.py +2 -1
- diffusers/image_processor.py +158 -45
- diffusers/loaders/__init__.py +2 -5
- diffusers/loaders/autoencoder.py +4 -4
- diffusers/loaders/controlnet.py +4 -4
- diffusers/loaders/ip_adapter.py +80 -22
- diffusers/loaders/lora.py +134 -20
- diffusers/loaders/lora_conversion_utils.py +46 -43
- diffusers/loaders/peft.py +4 -3
- diffusers/loaders/single_file.py +401 -170
- diffusers/loaders/single_file_model.py +290 -0
- diffusers/loaders/single_file_utils.py +616 -672
- diffusers/loaders/textual_inversion.py +41 -20
- diffusers/loaders/unet.py +168 -115
- diffusers/loaders/unet_loader_utils.py +163 -0
- diffusers/models/__init__.py +2 -0
- diffusers/models/activations.py +11 -3
- diffusers/models/attention.py +10 -11
- diffusers/models/attention_processor.py +367 -148
- diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
- diffusers/models/autoencoders/autoencoder_kl.py +18 -19
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
- diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
- diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
- diffusers/models/autoencoders/vae.py +23 -24
- diffusers/models/controlnet.py +12 -9
- diffusers/models/controlnet_flax.py +4 -4
- diffusers/models/controlnet_xs.py +1915 -0
- diffusers/models/downsampling.py +17 -18
- diffusers/models/embeddings.py +147 -24
- diffusers/models/model_loading_utils.py +149 -0
- diffusers/models/modeling_flax_pytorch_utils.py +2 -1
- diffusers/models/modeling_flax_utils.py +4 -4
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +118 -98
- diffusers/models/resnet.py +18 -23
- diffusers/models/transformer_temporal.py +3 -3
- diffusers/models/transformers/dual_transformer_2d.py +4 -4
- diffusers/models/transformers/prior_transformer.py +7 -7
- diffusers/models/transformers/t5_film_transformer.py +17 -19
- diffusers/models/transformers/transformer_2d.py +272 -156
- diffusers/models/transformers/transformer_temporal.py +10 -10
- diffusers/models/unets/unet_1d.py +5 -5
- diffusers/models/unets/unet_1d_blocks.py +29 -29
- diffusers/models/unets/unet_2d.py +6 -6
- diffusers/models/unets/unet_2d_blocks.py +137 -128
- diffusers/models/unets/unet_2d_condition.py +20 -15
- diffusers/models/unets/unet_2d_condition_flax.py +6 -5
- diffusers/models/unets/unet_3d_blocks.py +79 -77
- diffusers/models/unets/unet_3d_condition.py +13 -9
- diffusers/models/unets/unet_i2vgen_xl.py +14 -13
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +114 -14
- diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
- diffusers/models/unets/unet_stable_cascade.py +16 -13
- diffusers/models/upsampling.py +17 -20
- diffusers/models/vq_model.py +16 -15
- diffusers/pipelines/__init__.py +25 -3
- diffusers/pipelines/amused/pipeline_amused.py +12 -12
- diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
- diffusers/pipelines/animatediff/pipeline_output.py +3 -2
- diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
- diffusers/pipelines/auto_pipeline.py +21 -17
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
- diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
- diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
- diffusers/pipelines/controlnet_xs/__init__.py +68 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
- diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
- diffusers/pipelines/dit/pipeline_dit.py +3 -0
- diffusers/pipelines/free_init_utils.py +39 -38
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
- diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
- diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
- diffusers/pipelines/marigold/__init__.py +50 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
- diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
- diffusers/pipelines/pia/pipeline_pia.py +39 -125
- diffusers/pipelines/pipeline_flax_utils.py +4 -4
- diffusers/pipelines/pipeline_loading_utils.py +268 -23
- diffusers/pipelines/pipeline_utils.py +266 -37
- diffusers/pipelines/pixart_alpha/__init__.py +8 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
- diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
- diffusers/pipelines/shap_e/renderer.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
- diffusers/pipelines/stable_diffusion/__init__.py +0 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
- diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
- diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
- diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
- diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
- diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
- diffusers/schedulers/__init__.py +2 -2
- diffusers/schedulers/deprecated/__init__.py +1 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
- diffusers/schedulers/scheduling_amused.py +5 -5
- diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
- diffusers/schedulers/scheduling_consistency_models.py +23 -25
- diffusers/schedulers/scheduling_ddim.py +22 -24
- diffusers/schedulers/scheduling_ddim_flax.py +2 -1
- diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
- diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
- diffusers/schedulers/scheduling_ddpm.py +20 -22
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
- diffusers/schedulers/scheduling_deis_multistep.py +46 -42
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
- diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
- diffusers/schedulers/scheduling_edm_euler.py +53 -30
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
- diffusers/schedulers/scheduling_euler_discrete.py +163 -67
- diffusers/schedulers/scheduling_heun_discrete.py +60 -38
- diffusers/schedulers/scheduling_ipndm.py +8 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
- diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
- diffusers/schedulers/scheduling_lcm.py +21 -23
- diffusers/schedulers/scheduling_lms_discrete.py +27 -25
- diffusers/schedulers/scheduling_pndm.py +20 -20
- diffusers/schedulers/scheduling_repaint.py +20 -20
- diffusers/schedulers/scheduling_sasolver.py +55 -54
- diffusers/schedulers/scheduling_sde_ve.py +19 -19
- diffusers/schedulers/scheduling_tcd.py +39 -30
- diffusers/schedulers/scheduling_unclip.py +15 -15
- diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
- diffusers/schedulers/scheduling_utils.py +14 -5
- diffusers/schedulers/scheduling_utils_flax.py +3 -3
- diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
- diffusers/training_utils.py +56 -1
- diffusers/utils/__init__.py +7 -0
- diffusers/utils/doc_utils.py +1 -0
- diffusers/utils/dummy_pt_objects.py +30 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
- diffusers/utils/dynamic_modules_utils.py +24 -11
- diffusers/utils/hub_utils.py +3 -2
- diffusers/utils/import_utils.py +91 -0
- diffusers/utils/loading_utils.py +2 -2
- diffusers/utils/logging.py +1 -1
- diffusers/utils/peft_utils.py +32 -5
- diffusers/utils/state_dict_utils.py +11 -2
- diffusers/utils/testing_utils.py +71 -6
- diffusers/utils/torch_utils.py +1 -0
- diffusers/video_processor.py +113 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
- diffusers-0.28.0.dist-info/RECORD +414 -0
- diffusers-0.27.1.dist-info/RECORD +0 -399
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py

```diff
@@ -26,6 +26,7 @@ from transformers import (
     CLIPVisionModelWithProjection,
 )
 
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
     FromSingleFileMixin,
@@ -269,6 +270,7 @@ def retrieve_timesteps(
     num_inference_steps: Optional[int] = None,
     device: Optional[Union[str, torch.device]] = None,
     timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
     """
@@ -279,19 +281,23 @@ def retrieve_timesteps(
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
         num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-            `timesteps` must be `None`.
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
         device (`str` or `torch.device`, *optional*):
             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
         timesteps (`List[int]`, *optional*):
-            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
-            timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
-            must be `None`.
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
 
     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
     """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
     if timesteps is not None:
         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
         if not accepts_timesteps:
@@ -302,6 +308,16 @@ def retrieve_timesteps(
         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
         timesteps = scheduler.timesteps
         num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
     else:
         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
         timesteps = scheduler.timesteps
```
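The new `sigmas` plumbing in `retrieve_timesteps` mirrors the existing `timesteps` escape hatch: exactly one of `num_inference_steps`, `timesteps`, or `sigmas` may define the schedule, and the value is only forwarded when the scheduler's `set_timesteps` signature accepts it. A minimal usage sketch follows; the prompt, the inpainting inputs, and the choice of scheduler are illustrative assumptions (per the file list above, `scheduling_euler_discrete.py` is among the schedulers updated in this release):

```python
import torch
from diffusers import EulerDiscreteScheduler, StableDiffusionXLInpaintPipeline

pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

# A custom, descending noise schedule; the trailing 0.0 ends the trajectory.
custom_sigmas = [14.6, 6.3, 3.2, 1.9, 1.1, 0.6, 0.3, 0.1, 0.0]

image = pipe(
    prompt="a photo of a cat",
    image=init_image,        # PIL.Image, assumed defined elsewhere
    mask_image=mask_image,   # PIL.Image, assumed defined elsewhere
    sigmas=custom_sigmas,    # replaces timesteps/num_inference_steps
).images[0]
```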
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py (continued)

```diff
@@ -516,10 +532,10 @@ class StableDiffusionXLInpaintPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -545,17 +561,17 @@ class StableDiffusionXLInpaintPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -880,7 +896,12 @@ class StableDiffusionXLInpaintPipeline(
         return_noise=False,
         return_image_latents=False,
     ):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
```
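Rewrapping the `shape` tuple here also changes behavior slightly: `height` and `width` are now routed through `int()`, so float dimensions produced by caller-side arithmetic can no longer reach `torch.randn`, which rejects non-integer sizes. A small standalone illustration of the failure mode this guards against (not taken from the diff):

```python
import torch

height, width, vae_scale_factor = 768.0, 512.0, 8  # floats can arrive from caller arithmetic

# float // int is still a float, and torch.randn requires integer sizes:
# torch.randn(1, 4, height // vae_scale_factor, width // vae_scale_factor)  # TypeError

shape = (1, 4, int(height) // vae_scale_factor, int(width) // vae_scale_factor)
latents = torch.randn(shape)  # OK: torch.Size([1, 4, 96, 64])
```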
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py (continued)

```diff
@@ -1027,7 +1048,7 @@ class StableDiffusionXLInpaintPipeline(
         # because `num_inference_steps` might be even given that every timestep
         # (except the highest one) is duplicated. If `num_inference_steps` is even it would
         # mean that we cut the timesteps in the middle of the denoising step
-        # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+        # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
         # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
         num_inference_steps = num_inference_steps + 1
 
@@ -1110,20 +1131,22 @@ class StableDiffusionXLInpaintPipeline(
             self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
 
         Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
             embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
 
         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
```
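`get_guidance_scale_embedding` maps the scalar guidance weight `w` to a sinusoidal vector, in the same spirit as timestep embeddings (see the `model_vdm.py` link in the docstring). The following self-contained sketch reconstructs the computation from the unchanged function body that begins with `assert len(w.shape) == 1` and `w = w * 1000.0`; treat it as illustrative rather than a verbatim copy:

```python
import torch

def guidance_scale_embedding(
    w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    # log-spaced frequencies, as in transformer/timestep embeddings
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1))  # zero-pad odd dimensions
    assert emb.shape == (w.shape[0], embedding_dim)
    return emb

print(guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=256).shape)  # torch.Size([1, 256])
```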
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py (continued)

```diff
@@ -1185,13 +1208,14 @@ class StableDiffusionXLInpaintPipeline(
         prompt_2: Optional[Union[str, List[str]]] = None,
         image: PipelineImageInput = None,
         mask_image: PipelineImageInput = None,
-        masked_image_latents: torch.FloatTensor = None,
+        masked_image_latents: torch.Tensor = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         padding_mask_crop: Optional[int] = None,
         strength: float = 0.9999,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
+        sigmas: List[float] = None,
         denoising_start: Optional[float] = None,
         denoising_end: Optional[float] = None,
         guidance_scale: float = 7.5,
@@ -1200,13 +1224,13 @@ class StableDiffusionXLInpaintPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -1220,7 +1244,9 @@ class StableDiffusionXLInpaintPipeline(
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
         clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         **kwargs,
     ):
@@ -1253,11 +1279,12 @@ class StableDiffusionXLInpaintPipeline(
                 [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
                 and checkpoints that are not specifically fine-tuned on low resolutions.
             padding_mask_crop (`int`, *optional*, defaults to `None`):
-                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
-                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
-                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
-                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information irrelevant for inpainting, such as background.
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
             strength (`float`, *optional*, defaults to 0.9999):
                 Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                 between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
@@ -1273,6 +1300,10 @@ class StableDiffusionXLInpaintPipeline(
                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             denoising_start (`float`, *optional*):
                 When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
                 bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
@@ -1301,26 +1332,26 @@ class StableDiffusionXLInpaintPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the
-                `ip_adapter_image` input argument.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
@@ -1329,7 +1360,7 @@ class StableDiffusionXLInpaintPipeline(
             generator (`torch.Generator`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -1383,11 +1414,11 @@ class StableDiffusionXLInpaintPipeline(
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1417,6 +1448,9 @@ class StableDiffusionXLInpaintPipeline(
                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
             )
 
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
```
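With the widened annotation, `callback_on_step_end` accepts a plain callable or the `PipelineCallback`/`MultiPipelineCallbacks` objects from the new `diffusers/callbacks.py` module listed above; when one of those objects is passed, its `tensor_inputs` attribute replaces `callback_on_step_end_tensor_inputs`, as the hunk above shows. A sketch of the plain-callable form, reusing `pipe`, `init_image`, and `mask_image` from the earlier sketch (which tensors may be requested depends on the pipeline's `_callback_tensor_inputs`):

```python
def on_step_end(pipe, step_index, timestep, callback_kwargs):
    # Tensors named in callback_on_step_end_tensor_inputs arrive in callback_kwargs;
    # the returned dict is written back into the denoising loop.
    latents = callback_kwargs["latents"]
    print(f"step {step_index}, t={timestep}, latents std={latents.std().item():.3f}")
    return callback_kwargs

image = pipe(
    prompt="a photo of a cat",
    image=init_image,
    mask_image=mask_image,
    callback_on_step_end=on_step_end,
    callback_on_step_end_tensor_inputs=["latents"],
).images[0]
```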
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py (continued)

```diff
@@ -1490,7 +1524,9 @@ class StableDiffusionXLInpaintPipeline(
         def denoising_value_valid(dnv):
             return isinstance(dnv, float) and 0 < dnv < 1
 
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
         timesteps, num_inference_steps = self.get_timesteps(
             num_inference_steps,
             strength,
@@ -1718,7 +1754,12 @@ class StableDiffusionXLInpaintPipeline(
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
 
                 if num_channels_unet == 4:
                     init_latents_proper = image_latents
@@ -1770,6 +1811,10 @@ class StableDiffusionXLInpaintPipeline(
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
```
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py

```diff
@@ -169,6 +169,8 @@ class StableDiffusionXLInstructPix2PixPipeline(
             Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
             watermark output images. If not defined, it will default to True if the package is installed, otherwise no
             watermarker will be used.
+        is_cosxl_edit (`bool`, *optional*):
+            When set the image latents are scaled.
     """
 
     model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
@@ -185,6 +187,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
         scheduler: KarrasDiffusionSchedulers,
         force_zeros_for_empty_prompt: bool = True,
         add_watermarker: Optional[bool] = None,
+        is_cosxl_edit: Optional[bool] = False,
     ):
         super().__init__()
 
@@ -201,6 +204,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.default_sample_size = self.unet.config.sample_size
+        self.is_cosxl_edit = is_cosxl_edit
 
         add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
 
@@ -218,10 +222,10 @@ class StableDiffusionXLInstructPix2PixPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
     ):
         r"""
@@ -246,17 +250,17 @@ class StableDiffusionXLInstructPix2PixPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -432,7 +436,6 @@ class StableDiffusionXLInstructPix2PixPipeline(
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs
     def check_inputs(
         self,
         prompt,
@@ -483,7 +486,12 @@ class StableDiffusionXLInstructPix2PixPipeline(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -517,8 +525,8 @@ class StableDiffusionXLInstructPix2PixPipeline(
         # make sure the VAE is in float32 mode, as it overflows in float16
         needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
         if needs_upcasting:
+            image = image.float()
             self.upcast_vae()
-            image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
 
         image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
 
@@ -551,6 +559,9 @@ class StableDiffusionXLInstructPix2PixPipeline(
         if image_latents.dtype != self.vae.dtype:
            image_latents = image_latents.to(dtype=self.vae.dtype)
 
+        if self.is_cosxl_edit:
+            image_latents = image_latents * self.vae.config.scaling_factor
+
         return image_latents
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
```
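`is_cosxl_edit` exists for Cos Stable Diffusion XL Edit checkpoints, whose UNet expects image latents pre-scaled by the VAE scaling factor; with the flag set, `prepare_image_latents` applies `image_latents * self.vae.config.scaling_factor` as shown above. A loading sketch; the checkpoint path is a placeholder, not something taken from the diff:

```python
import torch
from diffusers import StableDiffusionXLInstructPix2PixPipeline

pipe = StableDiffusionXLInstructPix2PixPipeline.from_single_file(
    "path/to/cosxl_edit.safetensors",  # placeholder CosXL-Edit checkpoint
    torch_dtype=torch.float16,
    is_cosxl_edit=True,  # scale image latents for CosXL-Edit conditioning
).to("cuda")
```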
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py (continued)

```diff
@@ -611,14 +622,14 @@ class StableDiffusionXLInstructPix2PixPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         guidance_rescale: float = 0.0,
@@ -636,7 +647,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
             prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                 used in both text-encoders
-            image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
+            image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
                 The image(s) to modify with the pipeline.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
@@ -659,7 +670,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             image_guidance_scale (`float`, *optional*, defaults to 1.5):
-                Image guidance scale is to push the generated image towards the inital image `image`. Image guidance
+                Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
                 scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
                 generate images that are closely linked to the source image `image`, usually at the expense of lower
                 image quality. This pipeline requires a value of at least `1`.
@@ -678,21 +689,21 @@ class StableDiffusionXLInstructPix2PixPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -704,7 +715,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -918,7 +929,12 @@ class StableDiffusionXLInstructPix2PixPipeline(
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -937,6 +953,10 @@ class StableDiffusionXLInstructPix2PixPipeline(
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
```
diffusers/pipelines/stable_diffusion_xl/watermark.py

```diff
@@ -21,16 +21,22 @@ class StableDiffusionXLWatermarker:
 
         self.encoder.set_watermark("bits", self.watermark)
 
-    def apply_watermark(self, images: torch.FloatTensor):
+    def apply_watermark(self, images: torch.Tensor):
         # can't encode images that are smaller than 256
         if images.shape[-1] < 256:
             return images
 
         images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy()
 
-        images = [self.encoder.encode(image, "dwtDct") for image in images]
+        # Convert RGB to BGR, which is the channel order expected by the watermark encoder.
+        images = images[:, :, :, ::-1]
 
-        images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2)
+        # Add watermark and convert BGR back to RGB
+        images = [self.encoder.encode(image, "dwtDct")[:, :, ::-1] for image in images]
+
+        images = np.array(images)
+
+        images = torch.from_numpy(images).permute(0, 3, 1, 2)
 
         images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0)
         return images
```
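The watermark change fixes a channel-order bug: per the new comments, the `invisible-watermark` encoder expects OpenCV-style BGR arrays, while the pipeline tensors are RGB, so the encode step now converts to BGR and back. The `[..., ::-1]` slices simply reverse the channel axis, as this standalone check illustrates:

```python
import numpy as np

rgb = np.array([[[255, 0, 0]]], dtype=np.uint8)  # one pure-red RGB pixel
bgr = rgb[:, :, ::-1]                            # reverse the channel axis
print(bgr)  # [[[  0   0 255]]] -- red now sits in the last slot of (B, G, R)
```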