diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +97 -4
- diffusers/callbacks.py +56 -3
- diffusers/configuration_utils.py +13 -1
- diffusers/image_processor.py +282 -71
- diffusers/loaders/__init__.py +24 -3
- diffusers/loaders/ip_adapter.py +543 -16
- diffusers/loaders/lora_base.py +138 -125
- diffusers/loaders/lora_conversion_utils.py +647 -0
- diffusers/loaders/lora_pipeline.py +2216 -230
- diffusers/loaders/peft.py +380 -0
- diffusers/loaders/single_file_model.py +71 -4
- diffusers/loaders/single_file_utils.py +597 -10
- diffusers/loaders/textual_inversion.py +5 -3
- diffusers/loaders/transformer_flux.py +181 -0
- diffusers/loaders/transformer_sd3.py +89 -0
- diffusers/loaders/unet.py +56 -12
- diffusers/models/__init__.py +49 -12
- diffusers/models/activations.py +22 -9
- diffusers/models/adapter.py +53 -53
- diffusers/models/attention.py +98 -13
- diffusers/models/attention_flax.py +1 -1
- diffusers/models/attention_processor.py +2160 -346
- diffusers/models/autoencoders/__init__.py +5 -0
- diffusers/models/autoencoders/autoencoder_dc.py +620 -0
- diffusers/models/autoencoders/autoencoder_kl.py +73 -12
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
- diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
- diffusers/models/autoencoders/vae.py +18 -5
- diffusers/models/controlnet.py +47 -802
- diffusers/models/controlnet_flux.py +70 -0
- diffusers/models/controlnet_sd3.py +26 -376
- diffusers/models/controlnet_sparsectrl.py +46 -719
- diffusers/models/controlnets/__init__.py +23 -0
- diffusers/models/controlnets/controlnet.py +872 -0
- diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
- diffusers/models/controlnets/controlnet_flux.py +536 -0
- diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
- diffusers/models/controlnets/controlnet_sd3.py +489 -0
- diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
- diffusers/models/controlnets/controlnet_union.py +832 -0
- diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
- diffusers/models/controlnets/multicontrolnet.py +183 -0
- diffusers/models/embeddings.py +996 -92
- diffusers/models/embeddings_flax.py +23 -9
- diffusers/models/model_loading_utils.py +264 -14
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +334 -51
- diffusers/models/normalization.py +157 -13
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
- diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
- diffusers/models/transformers/dit_transformer_2d.py +1 -1
- diffusers/models/transformers/latte_transformer_3d.py +4 -4
- diffusers/models/transformers/pixart_transformer_2d.py +10 -2
- diffusers/models/transformers/sana_transformer.py +488 -0
- diffusers/models/transformers/stable_audio_transformer.py +1 -1
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +422 -0
- diffusers/models/transformers/transformer_cogview3plus.py +386 -0
- diffusers/models/transformers/transformer_flux.py +189 -51
- diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
- diffusers/models/transformers/transformer_ltx.py +469 -0
- diffusers/models/transformers/transformer_mochi.py +499 -0
- diffusers/models/transformers/transformer_sd3.py +112 -18
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +8 -1
- diffusers/models/unets/unet_2d_blocks.py +88 -21
- diffusers/models/unets/unet_2d_condition.py +9 -9
- diffusers/models/unets/unet_3d_blocks.py +9 -7
- diffusers/models/unets/unet_motion_model.py +46 -68
- diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
- diffusers/models/unets/unet_stable_cascade.py +2 -2
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +14 -6
- diffusers/pipelines/__init__.py +69 -6
- diffusers/pipelines/allegro/__init__.py +48 -0
- diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
- diffusers/pipelines/allegro/pipeline_output.py +23 -0
- diffusers/pipelines/animatediff/__init__.py +2 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
- diffusers/pipelines/auto_pipeline.py +88 -10
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/cogvideo/__init__.py +2 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
- diffusers/pipelines/cogview3/__init__.py +47 -0
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
- diffusers/pipelines/cogview3/pipeline_output.py +21 -0
- diffusers/pipelines/controlnet/__init__.py +86 -80
- diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
- diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
- diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
- diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
- diffusers/pipelines/flux/__init__.py +23 -1
- diffusers/pipelines/flux/modeling_flux.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +256 -48
- diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
- diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
- diffusers/pipelines/flux/pipeline_output.py +16 -0
- diffusers/pipelines/free_noise_utils.py +365 -5
- diffusers/pipelines/hunyuan_video/__init__.py +48 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
- diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
- diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
- diffusers/pipelines/kolors/text_encoder.py +2 -2
- diffusers/pipelines/kolors/tokenizer.py +4 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latte/pipeline_latte.py +2 -2
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
- diffusers/pipelines/ltx/__init__.py +50 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
- diffusers/pipelines/ltx/pipeline_output.py +20 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
- diffusers/pipelines/mochi/__init__.py +48 -0
- diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
- diffusers/pipelines/mochi/pipeline_output.py +20 -0
- diffusers/pipelines/pag/__init__.py +13 -0
- diffusers/pipelines/pag/pag_utils.py +8 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
- diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
- diffusers/pipelines/pia/pipeline_pia.py +2 -0
- diffusers/pipelines/pipeline_flax_utils.py +1 -1
- diffusers/pipelines/pipeline_loading_utils.py +250 -31
- diffusers/pipelines/pipeline_utils.py +158 -186
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
- diffusers/pipelines/sana/__init__.py +47 -0
- diffusers/pipelines/sana/pipeline_output.py +21 -0
- diffusers/pipelines/sana/pipeline_sana.py +884 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
- diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/quantizers/__init__.py +16 -0
- diffusers/quantizers/auto.py +139 -0
- diffusers/quantizers/base.py +233 -0
- diffusers/quantizers/bitsandbytes/__init__.py +2 -0
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
- diffusers/quantizers/bitsandbytes/utils.py +306 -0
- diffusers/quantizers/gguf/__init__.py +1 -0
- diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
- diffusers/quantizers/gguf/utils.py +456 -0
- diffusers/quantizers/quantization_config.py +669 -0
- diffusers/quantizers/torchao/__init__.py +15 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
- diffusers/schedulers/scheduling_ddim.py +4 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
- diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
- diffusers/schedulers/scheduling_ddpm.py +6 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
- diffusers/schedulers/scheduling_deis_multistep.py +102 -6
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
- diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
- diffusers/schedulers/scheduling_edm_euler.py +8 -6
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
- diffusers/schedulers/scheduling_euler_discrete.py +92 -7
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
- diffusers/schedulers/scheduling_heun_discrete.py +114 -8
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
- diffusers/schedulers/scheduling_lcm.py +2 -6
- diffusers/schedulers/scheduling_lms_discrete.py +76 -1
- diffusers/schedulers/scheduling_repaint.py +1 -1
- diffusers/schedulers/scheduling_sasolver.py +102 -6
- diffusers/schedulers/scheduling_tcd.py +2 -6
- diffusers/schedulers/scheduling_unclip.py +4 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
- diffusers/training_utils.py +63 -19
- diffusers/utils/__init__.py +7 -1
- diffusers/utils/constants.py +1 -0
- diffusers/utils/dummy_pt_objects.py +240 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
- diffusers/utils/dynamic_modules_utils.py +3 -3
- diffusers/utils/hub_utils.py +44 -40
- diffusers/utils/import_utils.py +98 -8
- diffusers/utils/loading_utils.py +28 -4
- diffusers/utils/peft_utils.py +6 -3
- diffusers/utils/testing_utils.py +115 -1
- diffusers/utils/torch_utils.py +3 -0
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ from transformers import (
|
|
25
25
|
|
26
26
|
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
27
27
|
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
28
|
-
from ...loaders import SD3LoraLoaderMixin
|
28
|
+
from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
|
29
29
|
from ...models.autoencoders import AutoencoderKL
|
30
30
|
from ...models.transformers import SD3Transformer2DModel
|
31
31
|
from ...schedulers import FlowMatchEulerDiscreteScheduler
|
@@ -74,6 +74,20 @@ EXAMPLE_DOC_STRING = """
|
|
74
74
|
"""
|
75
75
|
|
76
76
|
|
77
|
+
# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
|
78
|
+
def calculate_shift(
|
79
|
+
image_seq_len,
|
80
|
+
base_seq_len: int = 256,
|
81
|
+
max_seq_len: int = 4096,
|
82
|
+
base_shift: float = 0.5,
|
83
|
+
max_shift: float = 1.16,
|
84
|
+
):
|
85
|
+
m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
|
86
|
+
b = base_shift - m * base_seq_len
|
87
|
+
mu = image_seq_len * m + b
|
88
|
+
return mu
|
89
|
+
|
90
|
+
|
77
91
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
|
78
92
|
def retrieve_latents(
|
79
93
|
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
|
@@ -97,7 +111,7 @@ def retrieve_timesteps(
|
|
97
111
|
sigmas: Optional[List[float]] = None,
|
98
112
|
**kwargs,
|
99
113
|
):
|
100
|
-
"""
|
114
|
+
r"""
|
101
115
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
102
116
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
103
117
|
|
@@ -148,7 +162,7 @@ def retrieve_timesteps(
|
|
148
162
|
return timesteps, num_inference_steps
|
149
163
|
|
150
164
|
|
151
|
-
class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
165
|
+
class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
|
152
166
|
r"""
|
153
167
|
Args:
|
154
168
|
transformer ([`SD3Transformer2DModel`]):
|
@@ -224,6 +238,9 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
224
238
|
)
|
225
239
|
self.tokenizer_max_length = self.tokenizer.model_max_length
|
226
240
|
self.default_sample_size = self.transformer.config.sample_size
|
241
|
+
self.patch_size = (
|
242
|
+
self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
|
243
|
+
)
|
227
244
|
|
228
245
|
# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
|
229
246
|
def _get_t5_prompt_embeds(
|
@@ -538,6 +555,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
538
555
|
prompt,
|
539
556
|
prompt_2,
|
540
557
|
prompt_3,
|
558
|
+
height,
|
559
|
+
width,
|
541
560
|
strength,
|
542
561
|
negative_prompt=None,
|
543
562
|
negative_prompt_2=None,
|
@@ -549,6 +568,15 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
549
568
|
callback_on_step_end_tensor_inputs=None,
|
550
569
|
max_sequence_length=None,
|
551
570
|
):
|
571
|
+
if (
|
572
|
+
height % (self.vae_scale_factor * self.patch_size) != 0
|
573
|
+
or width % (self.vae_scale_factor * self.patch_size) != 0
|
574
|
+
):
|
575
|
+
raise ValueError(
|
576
|
+
f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
|
577
|
+
f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
|
578
|
+
)
|
579
|
+
|
552
580
|
if strength < 0 or strength > 1:
|
553
581
|
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
|
554
582
|
|
@@ -806,7 +834,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
806
834
|
padding_mask_crop: Optional[int] = None,
|
807
835
|
strength: float = 0.6,
|
808
836
|
num_inference_steps: int = 50,
|
809
|
-
|
837
|
+
sigmas: Optional[List[float]] = None,
|
810
838
|
guidance_scale: float = 7.0,
|
811
839
|
negative_prompt: Optional[Union[str, List[str]]] = None,
|
812
840
|
negative_prompt_2: Optional[Union[str, List[str]]] = None,
|
@@ -824,6 +852,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
824
852
|
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
825
853
|
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
826
854
|
max_sequence_length: int = 256,
|
855
|
+
mu: Optional[float] = None,
|
827
856
|
):
|
828
857
|
r"""
|
829
858
|
Function invoked when calling the pipeline for generation.
|
@@ -874,10 +903,10 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
874
903
|
num_inference_steps (`int`, *optional*, defaults to 50):
|
875
904
|
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
876
905
|
expense of slower inference.
|
877
|
-
|
878
|
-
Custom
|
879
|
-
|
880
|
-
|
906
|
+
sigmas (`List[float]`, *optional*):
|
907
|
+
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
|
908
|
+
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
|
909
|
+
will be used.
|
881
910
|
guidance_scale (`float`, *optional*, defaults to 7.0):
|
882
911
|
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
883
912
|
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
@@ -921,8 +950,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
921
950
|
The output format of the generate image. Choose between
|
922
951
|
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
923
952
|
return_dict (`bool`, *optional*, defaults to `True`):
|
924
|
-
Whether or not to return a [`~pipelines.
|
925
|
-
|
953
|
+
Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
|
954
|
+
a plain tuple.
|
926
955
|
callback_on_step_end (`Callable`, *optional*):
|
927
956
|
A function that calls at the end of each denoising steps during the inference. The function is called
|
928
957
|
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
@@ -933,6 +962,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
933
962
|
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
934
963
|
`._callback_tensor_inputs` attribute of your pipeline class.
|
935
964
|
max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
|
965
|
+
mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.
|
936
966
|
|
937
967
|
Examples:
|
938
968
|
|
@@ -953,6 +983,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
953
983
|
prompt,
|
954
984
|
prompt_2,
|
955
985
|
prompt_3,
|
986
|
+
height,
|
987
|
+
width,
|
956
988
|
strength,
|
957
989
|
negative_prompt=negative_prompt,
|
958
990
|
negative_prompt_2=negative_prompt_2,
|
@@ -1007,7 +1039,24 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
|
|
1007
1039
|
pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
|
1008
1040
|
|
1009
1041
|
# 3. Prepare timesteps
|
1010
|
-
|
1042
|
+
scheduler_kwargs = {}
|
1043
|
+
if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
|
1044
|
+
image_seq_len = (int(height) // self.vae_scale_factor // self.transformer.config.patch_size) * (
|
1045
|
+
int(width) // self.vae_scale_factor // self.transformer.config.patch_size
|
1046
|
+
)
|
1047
|
+
mu = calculate_shift(
|
1048
|
+
image_seq_len,
|
1049
|
+
self.scheduler.config.base_image_seq_len,
|
1050
|
+
self.scheduler.config.max_image_seq_len,
|
1051
|
+
self.scheduler.config.base_shift,
|
1052
|
+
self.scheduler.config.max_shift,
|
1053
|
+
)
|
1054
|
+
scheduler_kwargs["mu"] = mu
|
1055
|
+
elif mu is not None:
|
1056
|
+
scheduler_kwargs["mu"] = mu
|
1057
|
+
timesteps, num_inference_steps = retrieve_timesteps(
|
1058
|
+
self.scheduler, num_inference_steps, device, sigmas=sigmas, **scheduler_kwargs
|
1059
|
+
)
|
1011
1060
|
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
|
1012
1061
|
# check that number of inference steps is not < 1 - as this doesn't make sense
|
1013
1062
|
if num_inference_steps < 1:
|
@@ -446,13 +446,14 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
|
|
446
446
|
extra_step_kwargs["generator"] = generator
|
447
447
|
return extra_step_kwargs
|
448
448
|
|
449
|
-
# Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs
|
450
449
|
def check_inputs(
|
451
450
|
self,
|
452
451
|
prompt,
|
453
452
|
height,
|
454
453
|
width,
|
455
454
|
callback_steps,
|
455
|
+
gligen_images,
|
456
|
+
gligen_phrases,
|
456
457
|
negative_prompt=None,
|
457
458
|
prompt_embeds=None,
|
458
459
|
negative_prompt_embeds=None,
|
@@ -499,6 +500,13 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
|
|
499
500
|
f" {negative_prompt_embeds.shape}."
|
500
501
|
)
|
501
502
|
|
503
|
+
if gligen_images is not None and gligen_phrases is not None:
|
504
|
+
if len(gligen_images) != len(gligen_phrases):
|
505
|
+
raise ValueError(
|
506
|
+
"`gligen_images` and `gligen_phrases` must have the same length when both are provided, but"
|
507
|
+
f" got: `gligen_images` with length {len(gligen_images)} != `gligen_phrases` with length {len(gligen_phrases)}."
|
508
|
+
)
|
509
|
+
|
502
510
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
|
503
511
|
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
504
512
|
shape = (
|
@@ -814,6 +822,8 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
|
|
814
822
|
height,
|
815
823
|
width,
|
816
824
|
callback_steps,
|
825
|
+
gligen_images,
|
826
|
+
gligen_phrases,
|
817
827
|
negative_prompt,
|
818
828
|
prompt_embeds,
|
819
829
|
negative_prompt_embeds,
|
@@ -602,9 +602,9 @@ class StableDiffusionKDiffusionPipeline(
|
|
602
602
|
sigma_min: float = self.k_diffusion_model.sigmas[0].item()
|
603
603
|
sigma_max: float = self.k_diffusion_model.sigmas[-1].item()
|
604
604
|
sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
|
605
|
-
sigmas = sigmas.to(device)
|
606
605
|
else:
|
607
606
|
sigmas = self.scheduler.sigmas
|
607
|
+
sigmas = sigmas.to(device)
|
608
608
|
sigmas = sigmas.to(prompt_embeds.dtype)
|
609
609
|
|
610
610
|
# 6. Prepare latent variables
|
@@ -61,9 +61,21 @@ EXAMPLE_DOC_STRING = """
|
|
61
61
|
|
62
62
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
63
63
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
64
|
-
"""
|
65
|
-
|
66
|
-
|
64
|
+
r"""
|
65
|
+
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
66
|
+
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
67
|
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
68
|
+
|
69
|
+
Args:
|
70
|
+
noise_cfg (`torch.Tensor`):
|
71
|
+
The predicted noise tensor for the guided diffusion process.
|
72
|
+
noise_pred_text (`torch.Tensor`):
|
73
|
+
The predicted noise tensor for the text-guided diffusion process.
|
74
|
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
75
|
+
A rescale factor applied to the noise predictions.
|
76
|
+
|
77
|
+
Returns:
|
78
|
+
noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
|
67
79
|
"""
|
68
80
|
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
69
81
|
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
@@ -83,7 +95,7 @@ def retrieve_timesteps(
|
|
83
95
|
sigmas: Optional[List[float]] = None,
|
84
96
|
**kwargs,
|
85
97
|
):
|
86
|
-
"""
|
98
|
+
r"""
|
87
99
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
88
100
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
89
101
|
|
@@ -61,9 +61,21 @@ EXAMPLE_DOC_STRING = """
|
|
61
61
|
|
62
62
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
63
63
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
64
|
-
"""
|
65
|
-
|
66
|
-
|
64
|
+
r"""
|
65
|
+
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
66
|
+
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
67
|
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
68
|
+
|
69
|
+
Args:
|
70
|
+
noise_cfg (`torch.Tensor`):
|
71
|
+
The predicted noise tensor for the guided diffusion process.
|
72
|
+
noise_pred_text (`torch.Tensor`):
|
73
|
+
The predicted noise tensor for the text-guided diffusion process.
|
74
|
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
75
|
+
A rescale factor applied to the noise predictions.
|
76
|
+
|
77
|
+
Returns:
|
78
|
+
noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
|
67
79
|
"""
|
68
80
|
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
69
81
|
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
@@ -83,7 +95,7 @@ def retrieve_timesteps(
|
|
83
95
|
sigmas: Optional[List[float]] = None,
|
84
96
|
**kwargs,
|
85
97
|
):
|
86
|
-
"""
|
98
|
+
r"""
|
87
99
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
88
100
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
89
101
|
|
@@ -87,9 +87,21 @@ EXAMPLE_DOC_STRING = """
|
|
87
87
|
|
88
88
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
89
89
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
90
|
-
"""
|
91
|
-
|
92
|
-
|
90
|
+
r"""
|
91
|
+
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
92
|
+
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
93
|
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
94
|
+
|
95
|
+
Args:
|
96
|
+
noise_cfg (`torch.Tensor`):
|
97
|
+
The predicted noise tensor for the guided diffusion process.
|
98
|
+
noise_pred_text (`torch.Tensor`):
|
99
|
+
The predicted noise tensor for the text-guided diffusion process.
|
100
|
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
101
|
+
A rescale factor applied to the noise predictions.
|
102
|
+
|
103
|
+
Returns:
|
104
|
+
noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
|
93
105
|
"""
|
94
106
|
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
95
107
|
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
@@ -109,7 +121,7 @@ def retrieve_timesteps(
|
|
109
121
|
sigmas: Optional[List[float]] = None,
|
110
122
|
**kwargs,
|
111
123
|
):
|
112
|
-
"""
|
124
|
+
r"""
|
113
125
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
114
126
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
115
127
|
|
@@ -225,11 +237,8 @@ class StableDiffusionXLPipeline(
|
|
225
237
|
_callback_tensor_inputs = [
|
226
238
|
"latents",
|
227
239
|
"prompt_embeds",
|
228
|
-
"negative_prompt_embeds",
|
229
240
|
"add_text_embeds",
|
230
241
|
"add_time_ids",
|
231
|
-
"negative_pooled_prompt_embeds",
|
232
|
-
"negative_add_time_ids",
|
233
242
|
]
|
234
243
|
|
235
244
|
def __init__(
|
@@ -1231,13 +1240,8 @@ class StableDiffusionXLPipeline(
|
|
1231
1240
|
|
1232
1241
|
latents = callback_outputs.pop("latents", latents)
|
1233
1242
|
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
1234
|
-
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
1235
1243
|
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
|
1236
|
-
negative_pooled_prompt_embeds = callback_outputs.pop(
|
1237
|
-
"negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
|
1238
|
-
)
|
1239
1244
|
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
1240
|
-
negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
|
1241
1245
|
|
1242
1246
|
# call the callback, if provided
|
1243
1247
|
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
@@ -90,9 +90,21 @@ EXAMPLE_DOC_STRING = """
|
|
90
90
|
|
91
91
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
92
92
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
93
|
-
"""
|
94
|
-
|
95
|
-
|
93
|
+
r"""
|
94
|
+
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
95
|
+
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
96
|
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
97
|
+
|
98
|
+
Args:
|
99
|
+
noise_cfg (`torch.Tensor`):
|
100
|
+
The predicted noise tensor for the guided diffusion process.
|
101
|
+
noise_pred_text (`torch.Tensor`):
|
102
|
+
The predicted noise tensor for the text-guided diffusion process.
|
103
|
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
104
|
+
A rescale factor applied to the noise predictions.
|
105
|
+
|
106
|
+
Returns:
|
107
|
+
noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
|
96
108
|
"""
|
97
109
|
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
98
110
|
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
@@ -126,7 +138,7 @@ def retrieve_timesteps(
|
|
126
138
|
sigmas: Optional[List[float]] = None,
|
127
139
|
**kwargs,
|
128
140
|
):
|
129
|
-
"""
|
141
|
+
r"""
|
130
142
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
131
143
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
132
144
|
|
@@ -245,11 +257,8 @@ class StableDiffusionXLImg2ImgPipeline(
|
|
245
257
|
_callback_tensor_inputs = [
|
246
258
|
"latents",
|
247
259
|
"prompt_embeds",
|
248
|
-
"negative_prompt_embeds",
|
249
260
|
"add_text_embeds",
|
250
261
|
"add_time_ids",
|
251
|
-
"negative_pooled_prompt_embeds",
|
252
|
-
"add_neg_time_ids",
|
253
262
|
]
|
254
263
|
|
255
264
|
def __init__(
|
@@ -640,14 +649,16 @@ class StableDiffusionXLImg2ImgPipeline(
|
|
640
649
|
if denoising_start is None:
|
641
650
|
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
642
651
|
t_start = max(num_inference_steps - init_timestep, 0)
|
643
|
-
else:
|
644
|
-
t_start = 0
|
645
652
|
|
646
|
-
|
653
|
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
654
|
+
if hasattr(self.scheduler, "set_begin_index"):
|
655
|
+
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
656
|
+
|
657
|
+
return timesteps, num_inference_steps - t_start
|
647
658
|
|
648
|
-
|
649
|
-
|
650
|
-
|
659
|
+
else:
|
660
|
+
# Strength is irrelevant if we directly request a timestep to start at;
|
661
|
+
# that is, strength is determined by the denoising_start instead.
|
651
662
|
discrete_timestep_cutoff = int(
|
652
663
|
round(
|
653
664
|
self.scheduler.config.num_train_timesteps
|
@@ -655,7 +666,7 @@ class StableDiffusionXLImg2ImgPipeline(
|
|
655
666
|
)
|
656
667
|
)
|
657
668
|
|
658
|
-
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
669
|
+
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
659
670
|
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
660
671
|
# if the scheduler is a 2nd order scheduler we might have to do +1
|
661
672
|
# because `num_inference_steps` might be even given that every timestep
|
@@ -666,11 +677,12 @@ class StableDiffusionXLImg2ImgPipeline(
|
|
666
677
|
num_inference_steps = num_inference_steps + 1
|
667
678
|
|
668
679
|
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
669
|
-
|
680
|
+
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
681
|
+
timesteps = self.scheduler.timesteps[t_start:]
|
682
|
+
if hasattr(self.scheduler, "set_begin_index"):
|
683
|
+
self.scheduler.set_begin_index(t_start)
|
670
684
|
return timesteps, num_inference_steps
|
671
685
|
|
672
|
-
return timesteps, num_inference_steps - t_start
|
673
|
-
|
674
686
|
def prepare_latents(
|
675
687
|
self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
|
676
688
|
):
|
@@ -1423,13 +1435,8 @@ class StableDiffusionXLImg2ImgPipeline(
|
|
1423
1435
|
|
1424
1436
|
latents = callback_outputs.pop("latents", latents)
|
1425
1437
|
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
1426
|
-
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
1427
1438
|
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
|
1428
|
-
negative_pooled_prompt_embeds = callback_outputs.pop(
|
1429
|
-
"negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
|
1430
|
-
)
|
1431
1439
|
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
1432
|
-
add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
|
1433
1440
|
|
1434
1441
|
# call the callback, if provided
|
1435
1442
|
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
@@ -101,9 +101,21 @@ EXAMPLE_DOC_STRING = """
|
|
101
101
|
|
102
102
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
103
103
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
104
|
-
"""
|
105
|
-
|
106
|
-
|
104
|
+
r"""
|
105
|
+
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
106
|
+
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
107
|
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
108
|
+
|
109
|
+
Args:
|
110
|
+
noise_cfg (`torch.Tensor`):
|
111
|
+
The predicted noise tensor for the guided diffusion process.
|
112
|
+
noise_pred_text (`torch.Tensor`):
|
113
|
+
The predicted noise tensor for the text-guided diffusion process.
|
114
|
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
115
|
+
A rescale factor applied to the noise predictions.
|
116
|
+
|
117
|
+
Returns:
|
118
|
+
noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
|
107
119
|
"""
|
108
120
|
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
109
121
|
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
@@ -153,7 +165,7 @@ def retrieve_timesteps(
|
|
153
165
|
sigmas: Optional[List[float]] = None,
|
154
166
|
**kwargs,
|
155
167
|
):
|
156
|
-
"""
|
168
|
+
r"""
|
157
169
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
158
170
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
159
171
|
|
@@ -273,11 +285,8 @@ class StableDiffusionXLInpaintPipeline(
|
|
273
285
|
_callback_tensor_inputs = [
|
274
286
|
"latents",
|
275
287
|
"prompt_embeds",
|
276
|
-
"negative_prompt_embeds",
|
277
288
|
"add_text_embeds",
|
278
289
|
"add_time_ids",
|
279
|
-
"negative_pooled_prompt_embeds",
|
280
|
-
"add_neg_time_ids",
|
281
290
|
"mask",
|
282
291
|
"masked_image_latents",
|
283
292
|
]
|
@@ -901,14 +910,16 @@ class StableDiffusionXLInpaintPipeline(
|
|
901
910
|
if denoising_start is None:
|
902
911
|
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
903
912
|
t_start = max(num_inference_steps - init_timestep, 0)
|
904
|
-
else:
|
905
|
-
t_start = 0
|
906
913
|
|
907
|
-
|
914
|
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
915
|
+
if hasattr(self.scheduler, "set_begin_index"):
|
916
|
+
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
917
|
+
|
918
|
+
return timesteps, num_inference_steps - t_start
|
908
919
|
|
909
|
-
|
910
|
-
|
911
|
-
|
920
|
+
else:
|
921
|
+
# Strength is irrelevant if we directly request a timestep to start at;
|
922
|
+
# that is, strength is determined by the denoising_start instead.
|
912
923
|
discrete_timestep_cutoff = int(
|
913
924
|
round(
|
914
925
|
self.scheduler.config.num_train_timesteps
|
@@ -916,7 +927,7 @@ class StableDiffusionXLInpaintPipeline(
|
|
916
927
|
)
|
917
928
|
)
|
918
929
|
|
919
|
-
num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
|
930
|
+
num_inference_steps = (self.scheduler.timesteps < discrete_timestep_cutoff).sum().item()
|
920
931
|
if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
|
921
932
|
# if the scheduler is a 2nd order scheduler we might have to do +1
|
922
933
|
# because `num_inference_steps` might be even given that every timestep
|
@@ -927,11 +938,12 @@ class StableDiffusionXLInpaintPipeline(
|
|
927
938
|
num_inference_steps = num_inference_steps + 1
|
928
939
|
|
929
940
|
# because t_n+1 >= t_n, we slice the timesteps starting from the end
|
930
|
-
|
941
|
+
t_start = len(self.scheduler.timesteps) - num_inference_steps
|
942
|
+
timesteps = self.scheduler.timesteps[t_start:]
|
943
|
+
if hasattr(self.scheduler, "set_begin_index"):
|
944
|
+
self.scheduler.set_begin_index(t_start)
|
931
945
|
return timesteps, num_inference_steps
|
932
946
|
|
933
|
-
return timesteps, num_inference_steps - t_start
|
934
|
-
|
935
947
|
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
|
936
948
|
def _get_add_time_ids(
|
937
949
|
self,
|
@@ -1656,13 +1668,8 @@ class StableDiffusionXLInpaintPipeline(
|
|
1656
1668
|
|
1657
1669
|
latents = callback_outputs.pop("latents", latents)
|
1658
1670
|
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
1659
|
-
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
1660
1671
|
add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
|
1661
|
-
negative_pooled_prompt_embeds = callback_outputs.pop(
|
1662
|
-
"negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
|
1663
|
-
)
|
1664
1672
|
add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
|
1665
|
-
add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
|
1666
1673
|
mask = callback_outputs.pop("mask", mask)
|
1667
1674
|
masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
|
1668
1675
|
|
@@ -71,7 +71,7 @@ def retrieve_timesteps(
|
|
71
71
|
sigmas: Optional[List[float]] = None,
|
72
72
|
**kwargs,
|
73
73
|
):
|
74
|
-
"""
|
74
|
+
r"""
|
75
75
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
76
76
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
77
77
|
|
@@ -127,7 +127,7 @@ def retrieve_timesteps(
|
|
127
127
|
sigmas: Optional[List[float]] = None,
|
128
128
|
**kwargs,
|
129
129
|
):
|
130
|
-
"""
|
130
|
+
r"""
|
131
131
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
132
132
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
133
133
|
|
@@ -119,9 +119,21 @@ def _preprocess_adapter_image(image, height, width):
|
|
119
119
|
|
120
120
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
121
121
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
122
|
-
"""
|
123
|
-
|
124
|
-
|
122
|
+
r"""
|
123
|
+
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
124
|
+
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
125
|
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
126
|
+
|
127
|
+
Args:
|
128
|
+
noise_cfg (`torch.Tensor`):
|
129
|
+
The predicted noise tensor for the guided diffusion process.
|
130
|
+
noise_pred_text (`torch.Tensor`):
|
131
|
+
The predicted noise tensor for the text-guided diffusion process.
|
132
|
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
133
|
+
A rescale factor applied to the noise predictions.
|
134
|
+
|
135
|
+
Returns:
|
136
|
+
noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
|
125
137
|
"""
|
126
138
|
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
127
139
|
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
@@ -141,7 +153,7 @@ def retrieve_timesteps(
|
|
141
153
|
sigmas: Optional[List[float]] = None,
|
142
154
|
**kwargs,
|
143
155
|
):
|
144
|
-
"""
|
156
|
+
r"""
|
145
157
|
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
146
158
|
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
147
159
|
|
@@ -310,9 +310,21 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s
|
|
310
310
|
|
311
311
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
|
312
312
|
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
313
|
-
"""
|
314
|
-
|
315
|
-
|
313
|
+
r"""
|
314
|
+
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
315
|
+
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
316
|
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
317
|
+
|
318
|
+
Args:
|
319
|
+
noise_cfg (`torch.Tensor`):
|
320
|
+
The predicted noise tensor for the guided diffusion process.
|
321
|
+
noise_pred_text (`torch.Tensor`):
|
322
|
+
The predicted noise tensor for the text-guided diffusion process.
|
323
|
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
324
|
+
A rescale factor applied to the noise predictions.
|
325
|
+
|
326
|
+
Returns:
|
327
|
+
noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
|
316
328
|
"""
|
317
329
|
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
|
318
330
|
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
@@ -104,8 +104,8 @@ class PatchEmbed(nn.Module):
|
|
104
104
|
|
105
105
|
self.use_pos_embed = use_pos_embed
|
106
106
|
if self.use_pos_embed:
|
107
|
-
pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
|
108
|
-
self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
|
107
|
+
pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5), output_type="pt")
|
108
|
+
self.register_buffer("pos_embed", pos_embed.float().unsqueeze(0), persistent=False)
|
109
109
|
|
110
110
|
def forward(self, latent):
|
111
111
|
latent = self.proj(latent)
|